def get_page(page):
    content = ''
    imgurl = ''
    title = ''
    img_binary = ''
    tags = []
    explain = ''
    try:
        content = urllib2.urlopen(page, timeout=2).read()
        try:
            soup = bsoup(content, 'lxml')
        except:
            soup = bsoup(content, 'html.parser')
        tmp1 = soup.findAll('div', {'class': 'detail_image'})[0]
        tmp2 = soup.find('div', {'id': 'miaoshu'})
        tmp3 = soup.find('div', {'id': 'detail_additional'}).contents[-2]
        tags = re.findall("<a href=.*?>([^<]*)?</a>", unicode(tmp3))
        imgurl = tmp1.img.get('src', '')
        title = tmp1.img.get('title', '')
        explain = tmp2.p.text
        if explain == '' or explain is None:
            explain = title
        explain = explain.replace('\r\n', '').replace('\n', '')
        title = simplify(title)
        req = urllib2.Request(url=imgurl, headers={"Referer": page})
        img_binary = urllib2.urlopen(req).read()
        # Assumed success return, mirroring the failure tuple below.
        return True, title, explain, imgurl, img_binary, tags
    except Exception, e:
        print e
        return False, '', '', '', '', ''
def openSoup(x):
    """Opens URL and creates soup"""
    try:
        return bsoup(urllib2.urlopen(x).read())
    except urllib2.HTTPError, e:
        print 'Taking a breather...'
        time.sleep(120)
        return bsoup(urllib2.urlopen(x).read())
def read_file(self, filename):
    """
    Reads a single SemCor file in NLP Annotation Format (NAF).
    """
    with io.open(filename, 'r') as fin:
        print filename
        # May raise IOError: [Errno 5] Input/output error on a flaky
        # filesystem; not handled here.
        xml = fin.read()
        text = bsoup(xml).find('text')
        terms = bsoup(xml).find('terms')
        sentences = defaultdict(list)
        paragraphs = defaultdict(list)
        # Gets the term layer.
        termid2sense = {}
        for term in terms.findAll('term'):
            termid = int(term.get('id')[1:])
            term_sense = None
            try:
                sense = term.findAll('externalref')[-1].get('reference')
                term_sense = sense[6:] if sense.startswith('eng30-') else sense
            except:
                pass
            t = Term(termid, term.get('pos'), term.get('lemma'),
                     term_sense, term.get('type'))
            termid2sense[termid] = t
        # Gets the text layer.
        wordid2meta = {}
        for word in text.findAll('wf'):
            wordid = int(word.get('id')[1:])
            sentid = int(word.get('sent'))
            paraid = int(word.get('para'))
            try:
                term = termid2sense[wordid]
            except:
                # TODO: please check that all 'words' without term annotation
                # are punctuation.
                term = Term(id=wordid, pos=u'PUNCT', lemma=None,
                            sense=None, type=u'punct')
            w = Word(wordid, word.text, sentid, paraid, term)
            wordid2meta[wordid] = w
            sentences[sentid].append(wordid)
            paragraphs[paraid].append(sentid)
    return wordid2meta, termid2sense, sentences, paragraphs
def parseFiles(annotationsPath):
    objectList = []
    # Retrieves all the files in a directory and checks if they are xml
    annotationsFullPath = os.path.abspath(annotationsPath)
    fileList = os.listdir(annotationsFullPath)
    if len(fileList) > 0:
        lastFile = ''
        for file in fileList:
            fileTypeMatch = re.search('.xml', file)
            if fileTypeMatch:
                print "Processing file: " + file
                try:
                    filePath = os.path.join(annotationsFullPath, file)
                    f = open(filePath)
                    soup = bsoup(f)
                    f.close()
                    # Finds the objects of all xml files, places them into a
                    # list and returns it.
                    parsedXML = soup.findAll('name')
                    for object in parsedXML:
                        match = re.search('(<name>)(\w+)(</name>)', str(object))
                        objectList.append(match.group(2))
                except IOError:
                    sys.stderr.write("There was a problem with file: " + file + '\n')
    else:
        sys.stderr.write("Error - No xml files found.")
        sys.exit(1)
    return objectList
def get_salt_version(url, args, prev_branch=False):
    if 'windows' not in url and 'osx' not in url and prev_branch:
        url = url.replace(url.split('/')[-1], args)
    get_url = requests.get(url)
    ret_code = get_url.status_code
    if ret_code != 200:
        print('Attempt to query url failed with http error code: {0}'.format(
            ret_code))
        sys.exit(1)
    html = get_url.content
    parse_html = bsoup(html)
    pkg_name = 'salt-master'
    if 'osx' in url:
        try:
            pkg_name = 'salt-{0}'.format(args.branch)
        except AttributeError:
            pkg_name = 'salt-{0}'.format(args)
    if 'windows' in url:
        try:
            pkg_name = 'Salt-Minion-{0}'.format(args.branch)
        except AttributeError:
            pkg_name = 'Salt-Minion-{0}'.format(args)
    for tag in parse_html.findAll(attrs={'href': re.compile(pkg_name + ".*")}):
        match = re.search("([0-9]{1,4}\.)([0-9]{1,2}\.)([0-9]{1,2})", str(tag))
        salt_ver = match.group(0)
    return salt_ver
def parse_html_method(tab_os, os_v, args):
    '''
    Parse the index.html for install commands
    '''
    # Get and parse url variables
    if args.branch != LATEST:
        url = 'https://repo.saltstack.com/staging/{0}.html'.format(args.branch)
    else:
        url = 'https://repo.saltstack.com/staging/index.html'
    get_url = requests.get(url)
    html = get_url.content
    parse_html = bsoup(html)
    os_instruction = []
    for tag in parse_html.findAll(attrs={'id': tab_os}):
        # grab all instructions for a specific os and release,
        # for example grab debian7 for the latest release
        for tab_os_v in tag.findAll(attrs={'class': re.compile(os_v + ".*")}):
            for cmd in tab_os_v.findAll(attrs={'class': 'language-bash'}):
                if cmd not in os_instruction:
                    os_instruction.append(cmd)
            for cmd_2 in tab_os_v.findAll(attrs={'class': 'language-ini'}):
                if cmd_2 not in os_instruction:
                    os_instruction.append(cmd_2)
        # get all instructions that run on both versions of each os_family
        for cmd_all in tag.findAll('code', attrs={'class': None}):
            if cmd_all not in os_instruction:
                os_instruction.append(cmd_all)
    return os_instruction
def parseFiles(annotationsPath, objectType):
    tagList = []
    # Retrieves all the files in a directory and checks if they are xml
    annotationsFullPath = os.path.abspath(annotationsPath)
    fileList = os.listdir(annotationsFullPath)
    for file in fileList:
        fileTypeMatch = re.search('.xml', file)
        if fileTypeMatch:
            print "Processing file: " + file
            try:
                filePath = os.path.join(annotationsFullPath, file)
                f = open(filePath)
                soup = bsoup(f)
                f.close()
                # Finds the objects of all xml files, places them into a list
                # and returns it.
                parsedXML = soup.findAll('name')
                if objectType == 'all':
                    for object in parsedXML:
                        tagList.append(addToTagList(soup))
                elif objectType in ('car', 'person', 'bicycle'):
                    for object in parsedXML:
                        match = re.search('(<name>)(\w+)(</name>)', str(object))
                        if match.group(2) == objectType:
                            tagList.append(addToTagList(soup))
            except IOError:
                sys.stderr.write('There was a problem with file: ' + file + '\n')
    return tagList
def test_instances(self):
    """
    Returns the test instances from SemEval2007 Coarse-grain WSD task.

    >>> coarse_wsd = SemEval2007_Coarse_WSD()
    >>> inst2ans = coarse_wsd.get_answers()
    >>> for inst in inst2ans:
    ...     print inst, inst2ans[inst]
    ...     break
    d004.s073.t013 answer(sensekey=[u'pointer%1:06:01::', u'pointer%1:06:00::', u'pointer%1:10:00::'], lemma=u'pointer', pos=u'n')
    """
    Instance = namedtuple('instance', 'id, lemma, word')
    test_file = io.open(self.test_file, 'r').read()
    inst2ans = self.get_answers()
    for text in bsoup(test_file).findAll('text'):
        textid = text['id']
        document = " ".join([remove_tags(i) for i in str(text).split('\n')
                             if remove_tags(i)])
        for sent in text.findAll('sentence'):
            sentence = " ".join([remove_tags(i) for i in str(sent).split('\n')
                                 if remove_tags(i)])
            for instance in sent.findAll('instance'):
                instid = instance['id']
                lemma = instance['lemma']
                word = instance.text
                inst = Instance(instid, lemma, word)
                yield inst, inst2ans[instid], unicode(sentence), unicode(document)
def parseWikiContent(text):
    soup = bsoup(text)
    # check the article exists
    noarticle = soup('div', {"class": "noarticletext"})
    if len(noarticle) != 0:
        print 'not exist!!!'
        return None
    pSet = soup('div', {'id': 'mw-content-text'})[0].findChildren('p', recursive=False)
    loops = 3
    contents = ''
    for p in pSet:
        if loops == 0:
            break
        content = p.getText()
        if len(content) >= 4 and content[0:6].find(u'坐标') == -1:
            content = filterInvalidChar(pattern, content)
            contents += content.encode('utf-8') + '\n'
            loops -= 1
    if len(contents) > 0:
        return contents
    else:
        return None
def parseFiles(annotationsPath, objectType):
    orientationDict = {'car': [0, 0, 0, 0, 0],
                       'person': [0, 0, 0, 0, 0],
                       'bicycle': [0, 0, 0, 0, 0]}
    tempType = ''
    # Two parallel lists: an object in one list corresponds to the
    # orientation at the same position in the other list.
    parsedObjectXMLList = []
    parsedOrientationXMLList = []
    # Retrieves all the files in a directory and checks if they are xml
    fileList = os.listdir(annotationsPath)
    annotationsFullPath = os.path.abspath(annotationsPath)
    for file in fileList:
        fileTypeMatch = re.search('.xml', file)
        if fileTypeMatch:
            print "Processing file: " + file
            try:
                filePath = os.path.join(annotationsFullPath, file)
                f = open(filePath)
                soup = bsoup(f)
                f.close()
                # Finds the objects of all xml files and places them, together
                # with their pose, into the parallel lists.
                parsedObjectXML = soup.findAll('name')
                parsedOrientationXML = soup.pose.string
                for object in parsedObjectXML:
                    match = re.search('(<name>)(\w+)(</name>)', str(object))
                    object = match.group(2)
                    # keep the object when 'all' was requested or when it
                    # matches the requested class (car, person or bicycle)
                    if objectType == 'all' or object == objectType:
                        parsedObjectXMLList.append(object)
                        parsedOrientationXMLList.append(str(parsedOrientationXML))
            except IOError:
                sys.stderr.write('There was a problem with file: ' + file + '\n')
    for x in range(0, len(parsedObjectXMLList)):
        if objectType == 'all':
            tempType = objectType
            objectType = parsedObjectXMLList[x]
        if parsedObjectXMLList[x] == objectType:
            if parsedOrientationXMLList[x] == 'Left':
                orientationDict[objectType][0] += 1
            elif parsedOrientationXMLList[x] == 'Right':
                orientationDict[objectType][1] += 1
            elif parsedOrientationXMLList[x] == 'Frontal':
                orientationDict[objectType][2] += 1
            elif parsedOrientationXMLList[x] == 'Rear':
                orientationDict[objectType][3] += 1
            elif parsedOrientationXMLList[x] == 'Unspecified':
                orientationDict[objectType][4] += 1
        if tempType == 'all':
            objectType = tempType
    return orientationDict
def FakeResponse(a):
    test = file("backend/test/data/inventory/test_20120310_055847_infibeam.html", "r").read()
    test_data = str(bsoup(test).fetch('ul', 'srch_result portrait')[0].fetch('li')[0])
    # monkey patching test-data to get the correct minimal test-data
    test_data = str("<ul class='srch_result portrait'>" + test_data + "</ul>")
    return '200 OK', test_data
def remove_images(data_html):
    """
    Removes occurrences of images from data_html; any links that do not
    have any text will also be removed.
    """
    soup = bsoup(data_html)
    # remove all images
    for image in soup.findAll("img"):
        image.replaceWith('')
        try:
            logger.debug('removed img: %s' % (image["src"]))
        except KeyError:
            logger.debug('removed img: %s' % ("image link was not available"))
    # remove links to images or to anything, without any text
    # eg: <a href='http://link/to/some-page'></a>
    # the following will be left as it is:
    # <a href='http://link/to/some-page'>some text</a>
    for a in soup.findAll('a'):
        if not a.renderContents().strip():
            a.replaceWith('')
            logger.debug('removed a tag containing: %s' % (a))
    return smart_unicode(soup.renderContents())
def organizeImageInfo(annotationsFileList, photoFileList, annotationsFullPath,
                      photoFullPath, classes, orientation, tags):
    size = determinePhotoSize(classes, orientation)
    imageDict = {}
    root = Tk()
    annotationsSet = Set(annotationsFileList)
    for photo in photoFileList:
        photoMatch = re.search('(2014_)(\w+)(.png)', photo)
        if photoMatch:
            xml = str(photoMatch.group(1)) + str(photoMatch.group(2)) + '.xml'
            if xml in annotationsSet:
                xmlPath = os.path.join(annotationsFullPath, xml)
                f = open(xmlPath)
                soup = bsoup(f)
                f.close()
                (nameTagList, truncationTagList, occludedTagList, poseTagList,
                 xminTagList, yminTagList, xmaxTagList, ymaxTagList) = parseXML(soup)
                for name, truncation, occluded, pose, xmin, ymin, xmax, ymax in zip(
                        nameTagList, truncationTagList, occludedTagList, poseTagList,
                        xminTagList, yminTagList, xmaxTagList, ymaxTagList):
                    print "Processing file: " + xml
                    if classes.lower() == name[1]:
                        if orientation.lower() in (pose[1].lower(), 'all'):
                            if (tags.lower() == 'all'
                                    or tags.lower() == 'none' and int(truncation[1]) == 0 and int(occluded[1]) == 0
                                    or tags.lower() == 'occluded' and int(occluded[1])
                                    or tags.lower() == 'truncated' and int(truncation[1])
                                    or tags.lower() == 'occluded and truncated' and int(occluded[1]) and int(truncation[1])):
                                print "Match found in: " + photo
                                photoPath = os.path.join(photoFullPath, photo)
                                image = Image.open(photoPath)
                                image = image.crop((int(xmin[1]), int(ymin[1]),
                                                    int(xmax[1]), int(ymax[1])))
                                image = image.resize(size)
                                image = ImageTk.PhotoImage(image)
                                imageDict[image] = xml
    return imageDict, root, size
def read_file(self, filename):
    """
    Reads a single SemCor file in NLP Annotation Format (NAF).
    """
    with io.open(filename, 'r') as fin:
        xml = fin.read()
        text = bsoup(xml).find('text')
        terms = bsoup(xml).find('terms')
        sentences = defaultdict(list)
        paragraphs = defaultdict(list)
        # Gets the term layer.
        Term = namedtuple('term', 'id, pos, lemma, sense, type')
        termid2sense = {}
        for term in terms.findAll('term'):
            termid = int(term.get('id')[1:])
            term_sense = None
            try:
                sense = term.findAll('externalref')[-1].get('reference')
                term_sense = sense[6:] if sense.startswith('eng30-') else sense
            except:
                pass
            t = Term(termid, term.get('pos'), term.get('lemma'),
                     term_sense, term.get('type'))
            termid2sense[termid] = t
        # Gets the text layer.
        Word = namedtuple('word', 'id, text, offset, sentid, paraid, term')
        wordid2meta = {}
        for word in text.findAll('wf'):
            wordid = int(word.get('id')[1:])
            sentid = int(word.get('sent'))
            paraid = int(word.get('para'))
            try:
                term = termid2sense[wordid]
            except:
                term = None
            w = Word(wordid, word.text, word.get('offset'), sentid, paraid, term)
            wordid2meta[wordid] = w
            sentences[sentid].append(wordid)
            paragraphs[paraid].append(sentid)
    return wordid2meta, termid2sense, sentences, paragraphs
def _get_page(url, delay=2):
    conf = config.Configuration()
    global _timer
    while (datetime.datetime.now() - _timer).seconds < delay:
        time.sleep(.25)
    _timer = datetime.datetime.now()
    if conf.debug:
        print 'Fetching: %s' % url
    return bsoup(urlopen(url).read())
def get_salt_version(url):
    get_url = requests.get(url)
    html = get_url.content
    parse_html = bsoup(html)
    for tag in parse_html.findAll(
            attrs={'href': re.compile('salt-master' + ".*")}):
        match = re.search("([0-9]{1,4}\.)([0-9]{1,2}\.)([0-9]{1,2})", str(tag))
        salt_ver = match.group(0)
    return salt_ver
def collectObjectArea(annotationsPath, objectClass, noTruncated, noOcclusion):
    areaList = []
    truncationArea = []
    occlusionArea = []
    # Retrieves all the files in a directory and checks if they are xml
    annotationsFullPath = os.path.abspath(annotationsPath)
    fileList = os.listdir(annotationsFullPath)
    for file in fileList:
        fileTypeMatch = re.search('.xml', file)
        if fileTypeMatch:
            print "Processing file: " + file
            try:
                filePath = os.path.join(annotationsFullPath, file)
                f = open(filePath)
                soup = bsoup(f)
                f.close()
                parsedXML = soup.findAll('name')
                # Finds the objects of all xml files and checks whether they
                # belong to objectClass.
                if objectClass == 'all':
                    for photo in parsedXML:
                        for object in photo:
                            truncatedMatch = int(soup.truncated.string)
                            occlusionMatch = int(soup.occluded.string)
                            if not truncatedMatch and not occlusionMatch:
                                calculateArea(soup, areaList)
                            if truncatedMatch and not occlusionMatch:
                                calculateArea(soup, truncationArea)
                            if occlusionMatch and not truncatedMatch:
                                calculateArea(soup, occlusionArea)
                            if occlusionMatch and truncatedMatch:
                                calculateArea(soup, occlusionArea)
                else:
                    for photo in parsedXML:
                        for object in photo:
                            truncatedMatch = int(soup.truncated.string)
                            occlusionMatch = int(soup.occluded.string)
                            # For all objects of the type that the user
                            # specifies, the area is calculated and added
                            # to a list.
                            if object == objectClass:
                                if not truncatedMatch and not occlusionMatch:
                                    calculateArea(soup, areaList)
                                if truncatedMatch and not occlusionMatch:
                                    calculateArea(soup, truncationArea)
                                if occlusionMatch and not truncatedMatch:
                                    calculateArea(soup, occlusionArea)
                                if occlusionMatch and truncatedMatch:
                                    calculateArea(soup, occlusionArea)
            except IOError:
                sys.stderr.write('There was a problem with file: ' + file + '\n')
    if noTruncated is False:
        areaList += truncationArea
    if noOcclusion is False:
        areaList += occlusionArea
    return areaList
def order(self, data1):
    s = requests.Session()
    url = "http://desktop.nju.edu.cn:8080/jiaowu/login.do"
    r1 = s.post(url, data=data1, headers=headers)
    soup = bsoup(r1.text)
    name = soup.find(id='UserInfo')
    print(str(data1['password']))
    if name is not None:
        # logs the student id (学号) and password (密码) of a successful login
        file1.write("学号:" + str(data1['userName']) + " 密码:" + str(data1['password']) + "\n")
        print("学号:" + str(data1['userName']) + " 密码:" + str(data1['password']) + "\n")
        exit(1)
def yield_sentences(self):
    test_file = io.open(self.test_file, 'r').read()
    inst2ans = self.get_answers()
    for text in bsoup(test_file).findAll('text'):
        if not text:
            continue
        textid = text['id']
        context_doc = " ".join([remove_tags(i) for i in str(text).split('\n')
                                if remove_tags(i)])
        for sent in text.findAll('sentence'):
            context_sent = " ".join([remove_tags(i) for i in str(sent).split('\n')
                                     if remove_tags(i)])
            yield sent, context_sent, context_doc, inst2ans, textid
def _get(self, url, cookie_update=False):
    """ General HTTP GET function. Requires a URL. """
    if cookie_update:
        self.cookie = ""
    headers = {"Cookie": self.cookie}
    http = self.con(self.host)
    http.request("GET", url, headers=headers)
    resp = http.getresponse()
    page = bsoup(resp.read())
    if cookie_update:
        self._set_cookie(resp)
    return page
def get_yahoo_xml_photo_news(data_html, headline):
    """Scans the data for img tags and replaces them with yahoo nsml standard
    newsitem photo tags; if the img does not exist at the src, it will be
    removed."""
    soup = bsoup(data_html)
    image_count = 0
    return_tag_string = ""
    for image in soup.findAll("img"):
        new_img_tag = ''
        src = ''
        try:
            # extract the elements from the img tag
            if image.has_key("src"):
                src = image["src"]
                filename = src.split("/")[-1]
                alt = ""  # default to blank
                if image.has_key("alt"):
                    alt = image["alt"]
                elif image.has_key("title"):
                    alt = image["title"]
                if image.has_key("width") and image.has_key("height"):
                    width = image["width"]
                    height = image["height"]
                type, mime = get_media_mime_type(filename)
                new_img_tag = """<NewsComponent Duid="photo%s">""" % image_count + \
                    """<Role FormalName="Photo"/>""" + \
                    """<NewsLines><HeadLine>%s</HeadLine></NewsLines>""" % headline + \
                    """<DescriptiveMetadata><Language FormalName="en"/>""" + \
                    """</DescriptiveMetadata><NewsComponent>""" + \
                    """<Role FormalName="Caption"/><ContentItem>""" + \
                    """<MediaType FormalName="Text"/>""" + \
                    """<Format FormalName="NITF"/><DataContent><nitf><body>""" + \
                    """<body.content><p>%s</p></body.content>""" % alt + \
                    """</body></nitf></DataContent></ContentItem></NewsComponent>""" + \
                    """<NewsComponent Duid="base%s">""" % image_count + \
                    """<Role FormalName="BaseImage"/>""" + \
                    """<ContentItem Href="%s">""" % src + \
                    """<MediaType FormalName="Photo"/>""" + \
                    """<Format FormalName="JPEG Baseline"/>""" + \
                    """<MimeType FormalName="%s"/>""" % mime + \
                    """</ContentItem></NewsComponent></NewsComponent>"""
                image_count += 1
        except ContifyValidationError, e:
            # move on to the next image; catch the exception, log it and move on
            logger.error(e)
        finally:
            # assumed continuation: accumulate the generated photo component
            # for the caller
            return_tag_string += new_img_tag
    return return_tag_string
def convert_html_to_inline(html):
    """
    Parses an html document and substitutes img tags with inlined
    base64 encoded images

    Arguments:
    - `html`: An html, represented as a str object
    """
    soup = bsoup(html)
    for tag in soup.findAll('img'):
        new_tag = convert_imgtag_to_base64(unicode(tag))
        tag.replaceWith(new_tag)
    return soup
def convert_imgtag_to_base64(tag):
    """
    Returns an image tag with the URI substituted by a base64
    representation of the resource

    Arguments:
    - `tag`: A string containing an img tag
    """
    assert isinstance(tag, basestring)
    soup = bsoup(tag)
    tag = soup.contents[0]
    img_uri = tag['src']
    mime_type, data = get_image(img_uri)
    tag['src'] = "data:%s;base64,%s" % (mime_type, data)
    return tag
def download_users_and_parse(url):
    web_content = urllib.urlopen(url)
    html_data = web_content.read()
    web_content.close()
    data = bsoup(html_data).findAll('h2', attrs={'class': ['user-leaderboard-list-name']})
    user_list = []
    for item in data:
        user_list.append(item.findAll('a')[0]['href'].replace('/', ''))
    with open('../output/trending-users.json', 'w') as output:
        json.dump(user_list, output)
    return user_list
def _post(self, url, payload, cookie_update=False):
    """ General HTTP POST function. Requires a url and a payload. """
    body = urllib.urlencode(payload)
    headers = {
        "Cookie": self.cookie,
        "Content-Length": len(body),
        "Content-Type": "application/x-www-form-urlencoded",
    }
    http = self.con(self.host)
    http.request("POST", url, body, headers)
    resp = http.getresponse()
    page = bsoup(resp.read())
    if cookie_update:
        self._set_cookie(resp)
    return page
def img_to_yahoo_media(data_html, producer, prefix=None):
    """
    TODO:
    * Height and width being ignored
    * All a href to images being ignored
    * Image Caption - an image caption inside divs is not being considered

    Scans the data for img tags and replaces them with yahoo nsml standard
    media tags; if the img does not exist at the src, it will be removed.

    <img src="filename.jpg" alt="#alt_text#" align="#alignment#" width="#w#" height="#h#"/>

    <media media-type="image" style="align:#alignment#">
        <media-reference mime-type="" source="#photo_number"/>
    </media>

    @param data_html data in html format
    @param source source of the image, should be the publication or the account
    @param check_exists if True, will check if file exists and update the tags,
           else remove the instance
    @return data with img tags replaced by media tags
    """
    # use this with the following template tag if required
    # {{ entry.get_body_with_yahoo_media_tags|xmlsafe|hreftext|safe }}
    # remember TODO, remove nesting of img tags inside para tags
    soup = bsoup(data_html)
    # remove a href to images!!
    image_count = 0
    for image in soup.findAll("img"):
        new_img_tag = ''
        src = ''
        try:
            # extract the elements from the img tag
            if image.has_key("src"):
                src = image["src"]
                filename = src.split("/")[-1]
                type, mime = get_media_mime_type(filename)
                new_img_tag = """<media style="rightSide" media-type="image">""" + \
                    """<media-reference source="#photo%s" mime-type=""/></media>""" % \
                    image_count
                image_count += 1
        except ContifyValidationError, e:
            # move on to the next image; catch the exception, log it and move on
            logger.error(e)
        finally:
            # assumed continuation: swap the original <img> for the generated
            # media markup
            image.replaceWith(new_img_tag)
    return smart_unicode(soup.renderContents())
def order(self, data1, data2):
    s = requests.Session()
    url1 = 'http://login.qyer.com/qcross/login/auth.php?action=login'
    r1 = s.post(url1, data=data1, headers=headers)
    url2 = r1.json()['data']['arr_synlogin'][2]
    r2 = s.post(url2, headers=headers)
    # cookies = r1.cookies + r2.cookies
    print r2.cookies
    url3 = 'http://z.qyer.com/orderformconfirm'
    r3 = s.post(url3, data2, cookies=r2.cookies, headers=headers)
    print "================"
    soup = bsoup(r3.text)
    token = soup.findAll("input")[3]["value"]
    data_order['form_token'] = token
    # ordercookie = self.dealcookies(r1.cookies, r2.cookies, r3.cookies)
    # print ordercookie
    url4 = 'http://z.qyer.com/orderform'
    headers['Referer'] = 'http://z.qyer.com/orderformconfirm'
    submit = s.post(url4, data=data_order, cookies=r3.cookies, headers=headers)
def setUp(self): self.test = file("backend/test/data/inventory/test_20120310_055847_saholic.html", "r").read() self.test_data = str(bsoup(self.test).fetch('div', 'productItem')[0]) self.inventory = SaholicInventory(self.test_data) self.attr = attribute() self.item = { self.attr.name : u'Alcatel OT-230D', self.attr.id : md5( 'SHOLIC_Alcatel OT-230D' ).hexdigest(), self.attr.url : u'http://saholic.com/mobile-phones/alcatel-ot-230d-1001720', self.attr.specs : None, self.attr.color : None, self.attr.brand : None, self.attr.stock : None, self.attr.source: u'SHOLIC', self.attr.price : u'949', self.attr.image : u"http://static2.saholic.com/images/media/1001720/alcatel-ot-230d-icon-1313564847734.jpg", self.attr.delivery : None }
def setUp(self): self.test = file("backend/test/data/inventory/test_20120310_055847_flipkart.html", "r").read() self.test_data = str(bsoup(self.test).fetch('div', 'fk-srch-item')[0]) self.inventory = FlipkartInventory(self.test_data) self.attr = attribute() self.item = { self.attr.name : u'Samsung Galaxy Y S5360', self.attr.color : u'Grey', self.attr.specs : u'Android v2.3 OS, 2 MP Primary Camera, 3-inch Touchscreen, FM Radio', self.attr.stock : u'In Stock.', self.attr.price : u'7650', self.attr.image : u'http://img1.flixcart.com//image/mobile/4/4/4/samsung-galaxy-y-s5360-125x125-imad2pzjx3uq8paz.jpeg', self.attr.brand : None, self.attr.delivery : u'2-4 business days. Free Home Delivery.', self.attr.source : u'FKART', self.attr.url : u'http://flipkart.com//samsung-galaxy-y-s5360-mobile-phone/p/itmd2pz2rpcg5smz/search-mobile-/1?pid=mobd2pyzfanvw444&ref=c337db2d-b97a-4b4b-9061-bf3705435edd&_l=HmmZvbFeU9Oo4NUBP6Fi6Q--&_r=t2xsnCM8eE1pqUPoLth04Q--', self.attr.id : md5( 'FKART_Samsung Galaxy Y S5360' ).hexdigest() }
def setUp(self): self.test = file("backend/test/data/inventory/test_20120310_055847_infibeam.html", "r").read() self.test_data = str(bsoup(self.test).fetch('ul', 'srch_result portrait')[0].fetch('li')[0]) #monkey patching test-data to get the correct minimal test-data self.test_data = str("<ul class='srch_result portrait'>" + self.test_data + "</ul>") #inventory object. self.inventory = InfibeamInventory(self.test_data) #inventory item to be tested against. self.attr = attribute() self.item = { self.attr.id : md5( 'IBEAM_Sony Ericsson XPERIA X2 (Black)' ).hexdigest(), self.attr.url : "http://infibeam.com/Mobiles/i-Sony-Ericsson-XPERIA-X2-Slider/P-E-M-Sony-Ericsson-XPERIAX2.html?id=Black", self.attr.name : u'Sony Ericsson XPERIA X2 (Black)', self.attr.color : None, self.attr.specs : None, self.attr.stock : None, self.attr.brand : None, self.attr.price : u'25767', self.attr.source: u'IBEAM', self.attr.delivery : None, self.attr.image : u'http://cdn-img-a.infibeam.net/img/2ffd0b46/80/22/p-e-m-sony-ericsson-xperiax2-front-1.wm.jpg?op_sharpen=1&wid=120&hei=140'}
def get_salt_version(url, args, prev_branch=False):
    if 'osx' not in url and prev_branch:
        url = url.replace(url.split('/')[-1], args)
    get_url = requests.get(url)
    ret_code = get_url.status_code
    if ret_code != 200:
        print('Attempt to query url failed with http error code: {0}'.format(ret_code))
        sys.exit(1)
    html = get_url.content
    parse_html = bsoup(html)
    pkg_name = 'salt-master'
    if 'osx' in url:
        try:
            pkg_name = 'salt-{0}'.format(args.branch)
        except AttributeError:
            pkg_name = 'salt-{0}'.format(args)
    if 'windows' in url:
        pkg_name = 'Salt-Minion-{0}'.format(args.branch)
    for tag in parse_html.findAll(attrs={'href': re.compile(pkg_name + ".*")}):
        match = re.search("([0-9]{1,4}\.)([0-9]{1,2}\.)([0-9]{1,2})", str(tag))
        salt_ver = match.group(0)
    return salt_ver
def img_to_media(data_html, producer, prefix=None, download_to=None):
    """
    TODO:
    * Height and width being ignored
    * All a href to images being ignored
    * Image Caption - an image caption inside divs is not being considered

    Scans the data for img tags and replaces them with nsml standard media
    tags; if the img does not exist at the src, it will be removed.
    Files will be downloaded if the location is specified.

    <img src="filename.jpg" alt="#alt_text#" align="#alignment#" width="#w#" height="#h#"/>

    <media media-type="image" style="align:#alignment#">
        <media-reference mime-type="image/jpeg" source="filename.jpg"
            alternate-text="#alt-text#" height="#h#" width="#w#"></media-reference>
        <media-caption> #caption# </media-caption>
        <media-producer> source </media-producer>
    </media>

    @param data_html data in html format
    @param source source of the image, should be the publication or the account
    @param check_exists if True, will check if file exists and update the tags,
           else remove the instance
    @param download_to location where the files need to be downloaded, optional
    @return data with img tags replaced by media tags
    """
    soup = bsoup(data_html)
    src_list = []
    # remove a href to images!!
    for image in soup.findAll("img"):
        new_img_tag = ''
        src = ''
        try:
            # extract the elements from the img tag
            if image.has_key("src"):
                src = image["src"]
                filename = src.split("/")[-1]
                if prefix:
                    filename = prefix + "_" + filename
                alt = ""  # default to blank
                if image.has_key("alt"):
                    alt = image["alt"]
                align = 'align:right'  # default to right
                if image.has_key("align"):
                    align = image["align"]
                if image.has_key("width") and image.has_key("height"):
                    width = image["width"]
                    height = image["height"]
                    new_img_tag = get_media_tags(filename, producer, alt,
                                                 height=height, width=width,
                                                 align=align)
                else:
                    new_img_tag = get_media_tags(filename, producer, alt, align)
                # ignore height and width for now
                new_img_tag = get_media_tags(filename, producer, alt, align)
            else:
                # error - src has to be there!!
                raise ContifyValidationError("Image src missing, img tag: %s" % (image))
        except ContifyValidationError, e:
            # move on to the next image; catch the exception, log it and move on
            logger.error(e)
        finally:
            # assumed continuation: remember the source and swap the original
            # <img> for the generated media markup
            src_list.append(src)
            image.replaceWith(new_img_tag)
    return smart_unicode(soup.renderContents())
def rd_parse_post(entry):
    blogger_id = entry.id
    created = entry.published.split('.')[:1][0].replace('T', ' ')
    updated = entry.updated.split('.')[:1][0].replace('T', ' ')
    link = entry.link  # [-1]
    url = link.replace('http://rugbydump.blogspot.com', '')
    title = entry.title.encode('ASCII', 'ignore')
    content = entry.summary
    content = renode.sub(node, content).encode('ASCII', 'ignore')

    # Fix up content a bit
    xcontent = bsoup(content)
    img = xcontent.img
    src = img['src'].split('/')[-1]
    img['src'] = '/media/posts/' + src
    img['alt'] = title
    del img['border']
    del img['style']
    del img['id']

    # Put a centered paragraph around the image
    np = Tag(xcontent, 'p', [('style', 'text-align: center;')])
    np.insert(0, img)
    try:
        # Takes away the link around the first image
        xcontent.a.replaceWith(np)
    except:
        # No link around the image: just insert the paragraph at the top
        # (the blank link will remain, unfortunately)
        xcontent.insert(0, np)

    # Remove the last div
    xcontent.findAll('div', attrs={'class': 'blogger-post-footer'})[0].extract()
    try:
        blurb = xcontent.span.contents[0]
    except:
        blurb = ''
    content = xcontent.prettify()

    try:
        numcomments = entry.thr_total
    except AttributeError:
        numcomments = 0
    try:
        return {
            'src': src,
            'created': created,
            'updated': updated,
            'url': url,
            'numcomments': numcomments,
            'blogger_id': blogger_id,
            'title': title,
            'blurb': blurb,
            'content': content,
        }
    except UnicodeDecodeError:
        print "Skipping post \"%s\".." % title
        return
            'medium': common.sesame_base_url + video['thumbnailLarge'],
            'large': common.sesame_base_url + video['poster']
        },
        'cast': str(video['character']).split(';'),
        'width': float(video['width']),
        'height': float(video['height']),
    }
    menu.addVideoItem(item, video)
    items.append(item)
    settings.setobj('temp video list', items)
    ok = True

page = common.args.get('page', None)
if page == 'topics':
    html = utils.getHTML('videos')
    lis = bsoup(html).find('select', {'class': re.compile("filter-topic")}).findAll('option')
    for item in lis:
        if item['value'] == '':
            continue
        menu.addFolderItem(item.string,
                           {'page': 'list_vids', 'reset': 1, 'topic': int(item['value'])})
elif page == 'recent':
    videos = fetch_vids(reset=True)
    list_vids(videos)
    menu.moreVideosBtn()
elif page == 'muppets':
    # get JSON-formatted names
    html = utils.getHTML('ump-portlet/js/sw/sw.ump.js')
    match = re.findall("muppets\s+:\s+\[([\s\"a-zA-Z\|\,]+)\]", html)
    match = re.findall("\"([a-zA-Z\s\|]+)\"", match[0])
def ssdut_news_parse(raw):
    ''' parse the raw page src and store all results in a Storage object.
    all strings are unicode

    result.soup        BeautifulSoup object
    result.raw         raw page src
    result.hash        sha1 hash of the page
    result.title       title
    result.source      来源 (source)
    result.date_str    date in string
    result.date        date object
    result.body        html src of the news body
    result.clean_body  unescaped src of the news body
    result.publisher   发表人 (publisher)
    '''
    soup = bsoup(raw)
    result = Storage()

    # raw page / hash
    result.raw = raw
    result.soup = soup

    # title
    s = soup.find(attrs={'class': re_compile('title')})
    result.title = s.text

    # source
    text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn"))
    r = re_compile(ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:")
    res = r.findall(text)[0]
    result.source = res[1].rstrip()

    # date
    result.date_str = res[0]
    result.date = datetime.date(*[int(n) for n in result.date_str.split('-')])

    # content (body)
    c = soup.find(attrs={'class': re_compile('content')})
    result.body = unicode(c)

    # content (body) unescaped
    texts = c.findAll(text=True)
    all_texts = '\n'.join(texts)
    result.clean_body = html_parser.unescape(all_texts)

    # publisher (can be found at the bottom of the page)
    s = soup.find(attrs={
        "style": "font-size:14px;float:left;text-align:right;width:80%"})
    r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)")  # u"发表人：(.+)"
    try:
        name = r.findall(s.text)[0]
    except:
        logging.warn(" %s has no publisher " % result.title)
        name = ""  # no publisher, e.g. index.php/News/8692.html
    result.publisher = name.strip()

    # use utf-8 encoding
    for k in ['title', 'source', 'body', 'clean_body', 'publisher']:
        result[k] = result[k].encode('utf-8')

    hash_src = result.body + result.title + result.publisher
    if isinstance(hash_src, str):
        hash_src = unicode(hash_src, "utf-8", "ignore")
    result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest()

    result.search_text = ''.join([result.title, result.source,
                                  result.clean_body, result.publisher,
                                  result.sha1])
    return result
def ssdut_news_list(page_raw):
    ''' parse the news_list page and get a list of news in the same
    sequence as the page

    result.soup
          .page_no
          .news_list
          .total_records
    '''
    result = Storage()
    soup = bsoup(page_raw)
    result.soup = soup

    # get current page number
    r = soup.find(text=ur"\u4e0b\u4e00\u9875")  # text=u"下一页" (next page)
    if r:
        # not the last page
        next_page_link = r.parent.attrs[0][1]
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(next_page_link).group(1)
        page_no = int(page_no)  # - 1
    else:
        # the last page
        r = soup.find(text=ur'\u4e0a\u4e00\u9875')  # text=u"上一页" (previous page)
        prev_page_link = r.parent.attrs[0][1]
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(prev_page_link).group(1)
        page_no = int(page_no)  # + 1
    result.page_no = page_no

    # get the news list
    res = soup.findAll(attrs={"bgcolor": "#EEEEEE"})
    news_list = []
    counter = 1
    for r in res:
        a = r.findChildren("a")
        date_str = r.find(text=re_compile("\d{4}-\d{2}-\d{2}")).encode("utf-8")
        news_list.append({
            "link": a[0].get("href").encode("utf-8"),
            "title": a[0].text.encode("utf-8"),
            "source": a[1].text.encode("utf-8"),
            "source_link": a[1].get("href").encode("utf-8"),
            "date_str": date_str,
            "date": datetime.date(*[int(n) for n in date_str.split("-")]),
            "no": counter,
        })
        counter += 1
    result.news_list = news_list

    # total news count, taken from the "共N 条记录" (N records in total) label
    s = soup.find(text=re_compile(ur"\u5171\d+ \u6761\u8bb0\u5f55"))
    r = re_compile(ur"\u5171(\d+)")
    result.total_records = int(r.search(s).group(1))
    return result
"HEALTH_AND_FITNESS", "HOUSE_AND_HOME", "LIFESTYLE", "MAPS_AND_NAVIGATION", "MEDICAL", "MUSIC_AND_AUDIO", "NEWS_AND_MAGAZINES", "PARENTING", "PERSONALIZATION", "PHOTOGRAPHY", "PRODUCTIVITY", "SHOPPING", "SOCIAL", "SPORTS", "TOOLS", "TRAVEL_AND_LOCAL", "VIDEO_PLAYERS", "WEATHER", "LIBRARIES_AND_DEMO", "GAME_ARCADE", "GAME_PUZZLE", "GAME_CARD", "GAME_CASUAL", "GAME_RACING", "GAME_SPORTS", "GAME_ACTION", "GAME_ADVENTURE", "GAME_BOARD", "GAME_CASINO", "GAME_EDUCATIONAL", "GAME_MUSIC", "GAME_ROLE_PLAYING", "GAME_SIMULATION", "GAME_STRATEGY", "GAME_TRIVIA", "GAME_WORD", "ANDROID_WEAR" ] for i in range(0, len(category)): url = "https://play.google.com/store/apps/category/" + category[ i] + "/collection/topselling_free?hl=ja" html = requests.get(url).text soup = bsoup(html) urlslist = soup.findAll("a", {"class": "card-click-target"}) urls = [] #open the file to keep the list, as required filename = url[44:-33] + ".txt" fo = open(filename, 'w') #Url list for a in urlslist: link = "https://play.google.com" + a['href'] #print(link) urls.append(link) url = urls[::4] for item in url: item = item[46:] #list as package name
def openSoup(x):
    """Opens URL and creates soup"""
    return bsoup(urllib2.urlopen(x).read())
def FakeResponse(a):
    test = file("backend/test/data/inventory/test_20120310_055847_flipkart.html", "r").read()
    test_data = str(bsoup(test).fetch('div', 'fk-srch-item'))
    return '200 OK', test_data
# coding: utf-8
import urllib2
#import xml.etree.ElementTree as etree
from BeautifulSoup import BeautifulSoup as bsoup

f = open('links.txt', 'w')
links = []
#url = "http://gyokai-search.com/2nd-genre.htm"
url = "http://cooksonia.6.ql.bz/test/gyokai.html"
html = urllib2.urlopen(url).read()
soup = bsoup(html)
links = soup.findAll('a')
for link in links:
    str_ = dict(link.attrs)['href']
    f.write(str_ + "\n")
f.close()

"""
for url in links:
    html = urllib2.urlopen(url).read()
    soup = bsoup(html)
    h1 = soup.find('h1', attrs={"class": "yjXL"}).contents
    text = soup.find('p', attrs={"class": "ynDetailText"}).contents
    f = open('./rss/' + str(h1[0]) + '.txt', 'w')
    for t in text:
        f.write(str(t))
    f.close()
"""
    u = urllib.urlopen(link)
    torrent = u.read()
    f = file(path, 'wb')
    f.write(torrent)
    f.close()
    return True


def list_shows():
    result = []
    try:
        inputFP = urllib.urlopen(BASE_URL)
    except Exception, e:
        print e
        return result
    for option in bsoup(inputFP)('option'):
        optContents = ''.join(option.contents)
        optValue = option['value']
        if optValue.isnumeric():
            result.append({optContents: optValue})
    inputFP.close()
    return result


def is_in_cache(cache, key, element):
    if cache.has_key(key):
        return element in cache[key]
    return False


def store_in_cache(cache, key, element):
    # assumed minimal body: record the element under the given key
    cache.setdefault(key, []).append(element)