def parse_element(script):
    """Consume one push element from the front of a hex-encoded script.

    Returns a ``(data, remaining_script)`` tuple. ``data`` is the pushed
    payload as a hex string for direct/PUSHDATA pushes, a small int for
    PUSH1..PUSH16 / PUSHM1, or None for unrecognized opcodes.
    """
    data = None
    raw = unhexlify(script)
    opcode = raw[0]
    if 0x00 <= opcode <= 0x4B:
        # Direct push: the opcode itself is the payload byte length.
        size = opcode
        data = script[2:2 + size * 2]
        script = script[2 + size * 2:]
    elif opcode == 0x4C:
        # PUSHDATA1: one-byte length prefix follows the opcode.
        size = int(script[2:4], 16)
        data = script[4:4 + size * 2]
        script = script[4 + size * 2:]
    elif opcode == 0x4D:
        # PUSHDATA2: two-byte length, byte order swapped via CT.big_or_little.
        size = int(CT.big_or_little(script[2:6]), 16)
        data = script[6:6 + size * 2]
        script = script[6 + size * 2:]
    elif opcode == 0x4E:
        # PUSHDATA4: four-byte length, byte order swapped via CT.big_or_little.
        size = int(CT.big_or_little(script[2:10]), 16)
        data = script[10:10 + size * 2]
        script = script[10 + size * 2:]
    elif opcode == 0x4F:
        # PUSHM1 pushes the literal value -1.
        data = -1
        script = script[2:]
    elif 0x51 <= opcode <= 0x60:
        # PUSH1..PUSH16 encode the integers 1..16 relative to 0x50.
        data = opcode - 0x50
        script = script[2:]
    # Any other opcode: leave data as None and the script untouched,
    # matching the original's silent fall-through.
    return data, script
def solveReviewerBack(self):
    """Write a reviewer table to ../data/<fetchDate>/reviewer.txt.

    Emits the Reviewer table header, then one profile row per non-empty
    reviewer ID in self.reviewerList via Reviewer.saveProfileUp.
    """
    # newAsinList = []
    with open("../data/" + self.fetchDate.isoformat() + "/reviewer.txt",
              "w") as fout:
        ct = CommonTool(fout)
        ct.writeln(Reviewer.Reviewer.tableHead)
        for i, reviewerID in enumerate(self.reviewerList):
            # Skip placeholder entries.
            if reviewerID == '':
                continue
            print 'solve reviewer {0} of {1}: {2}'.format(
                i, len(self.reviewerList), reviewerID)
            reviewer = Reviewer.loadReviewer(reviewerID)
            # newAsinList.extend(reviewer.getReviewedBookList())
            reviewer.saveProfileUp(ct)
def printReviews(self, ct=None):
    """Write every review in self.reviewList to review/review.txt.

    When no CommonTool is supplied, one is created here: a fresh file
    gets the table header, an existing file is opened in append mode.
    The file is closed at the end only if it was opened here.
    """
    fout = ''
    flag = False  # True-ish only when this call opened the file itself
    if ct is None:
        flag = 1
        if not os.path.exists("../data/" + self.fetchDate.isoformat() +
                              "/review/"):
            os.makedirs(
                "../data/" + self.fetchDate.isoformat() + "/review/")
        ct = CommonTool()
        if not os.path.exists("../data/" + self.fetchDate.isoformat() +
                              "/review/review.txt"):
            fout = open(
                "../data/" + self.fetchDate.isoformat() +
                "/review/review.txt", "w")
            print "writing reviews in new file"
            ct.setFout(fout)
            # Header row is only written for a brand-new file.
            ct.writeln(Review.Review.tableHead)
        else:
            fout = open(
                "../data/" + self.fetchDate.isoformat() +
                "/review/review.txt", "a")
            ct.setFout(fout)
    for reviewID in self.reviewList:
        print "printing reviews"
        review = Review.loadReview(reviewID, self.fetchDate)
        review.printData(ct)
        review.insertReviewDataIntoTable()
    if flag:
        fout.close()
def solveRightCol(self):
    """Extract self.listPrice from the page's buy-box right column.

    Sets listPrice to 0 when the price span is absent, the label is not
    'List Price:', or any parsing step raises.
    """
    try:
        divRightCol = self.html.xpath(".//div[@id='rightCol']")[0]
        divBuyBoxInner = divRightCol.xpath(".//div[@id='buyBoxInner']")[0]
        priceSpan = divBuyBoxInner.xpath("./div/div[2]/ul/li/span")
        if len(priceSpan) == 0:
            self.listPrice = 0
        else:
            # First sub-span is the label, second the amount.
            priceType = priceSpan[0].xpath("./span")[0].text.strip()
            priceValue = priceSpan[0].xpath("./span")[1].text.strip()
            if priceType == 'List Price:':
                self.listPrice = CommonTool.strToFloat(priceValue)
            else:
                self.listPrice = 0
            del priceType
            del priceValue
        del priceSpan
        del divBuyBoxInner
        del divRightCol
    except Exception, e:
        # Best-effort scrape: any failure defaults the price to 0.
        print 'RightCol error: %s' % str(e)
        self.listPrice = 0
def getProductLinksUsingThreads(self, productList):
    """Scrape one page of a reviewer's product-review listing.

    For each link: records first/last review dates (on first/last page),
    then per table row collects product link, star rate, review ID and a
    book/product label into self.allProductLinks; reviewed-book ASINs go
    to self.reviewedBookList. Increments self.page at the end.

    NOTE(review): indentation below was reconstructed from a collapsed
    source line — nesting of the reviewID/label/productList sections is
    a best-effort reading; confirm against the original file.
    """
    print "getting products using thread method"
    for link in productList:
        linkPage = MyHtml.getHtml(
            link, self.rID + "_AllProductLinks_" + str(self.page))
        trObjs = linkPage.xpath(
            ".//body/table[2]/tr[1]/td[2]/table[2]/tr[@valign='top']")
        # On the first page, the last row carries the earliest review date.
        if link is self.firstLink:
            try:
                firstTimeObj = trObjs[-1]
            except LookupError:
                print link
                exit(0)
            fTime = firstTimeObj.xpath("./following-sibling::*")
            if len(fTime) == 0:
                self.fRevTime = 'N/A'
            else:
                # print etree.tostring(fTime[0])
                fTime = fTime[0].xpath(".//nobr")
                if len(fTime) == 0:
                    self.fRevTime = 'N/A'
                else:
                    self.fRevTime = fTime[0].text.strip()
                    self.fRevTime = CommonTool.strToDate(self.fRevTime)
            del fTime
        # On the last page, the first row carries the latest review date.
        if link is self.lastLink:
            lastTimeObj = trObjs[0]
            lTime = lastTimeObj.xpath("./following-sibling::*")
            if len(lTime) == 0:
                self.lRevTime = 'N/A'
            else:
                # print etree.tostring(lTime[0])
                lTime = lTime[0].xpath(".//nobr")
                if len(lTime) == 0:
                    self.lRevTime = 'N/A'
                else:
                    self.lRevTime = lTime[0].text.strip()
                    self.lRevTime = CommonTool.strToDate(self.lRevTime)
            del lTime
        for trObj in trObjs:
            tableObj = trObj.xpath(
                "./td[@class='small'][3]/table[@class='small']")
            # aLink
            if len(tableObj) != 0:
                aLink = tableObj[0].xpath(".//a")
                if len(aLink) == 0:
                    aLink = ''
                else:
                    aLink = aLink[0].attrib['href']
                # rate
                rateObj = trObj.xpath("./following-sibling::*")
                rate = 'N/A'
                if len(rateObj) != 0:
                    try:
                        rateObj1 = rateObj[0].xpath(".//img")
                        title = rateObj1[0].attrib['title']
                        # e.g. "4.0 out of 5 stars" -> "4.0"
                        rate = title.split("out")[0].strip()
                        self.sum = self.sum + float(rate)
                        self.counter = self.counter + 1
                    except Exception, e:
                        sys.stderr.write(str(e) + ' rate Exception\n')
                    # reviewID
                    reviewID = ''
                    rIDObj = rateObj[0].xpath(".//a")
                    if len(rIDObj) != 0:
                        reviewID = rIDObj[0].attrib['name']
                    # label The review is from
                    label = ''
                    labelObj = rateObj[0].xpath(".//div[@class='tiny']")
                    if len(labelObj) != 0:
                        # verified purchase + the review is from
                        aObj = labelObj[-1].xpath(".//a")
                        if len(aObj) != 0:
                            label = filtTag.filter_tags(
                                etree.tostring(aObj[0]).strip())
                            # 1---book 0---product
                            parLeft = label.find('(')
                            parRight = label.find(')')
                            if parLeft == -1 and parRight == -1:
                                label = '0'
                            elif label[-1] == ')':
                                # Keep the text inside the final parentheses.
                                label = label.split('(')
                                label = label[-1][:-1]
                                if (label.find('Paperback') != -1) or \
                                        (label.find('Hardcover') != -1):
                                    label = '1'
                                    # ASIN is the last 10 chars of the link.
                                    bookID = aLink.replace(
                                        '/ref=cm_cr-mr-title', '')
                                    bookID = bookID[-10:]
                                    self.reviewedBookList.append(bookID)
                # NOTE(review): this rebinding shadows the 'productList'
                # parameter being iterated above (safe because the iterator
                # was already captured, but confusing).
                productList = []
                productList.append(aLink)
                productList.append(rate)
                productList.append(reviewID)
                productList.append(label)
                self.allProductLinks.append(productList)
                del productList
    self.page = self.page + 1
async def crawl(self):
    """Continuously sync blocks into the store, batched by max_tasks.

    Resumes from the persisted state height. Each round caches up to
    ``self.max_tasks`` blocks, fans out vin/vout/claim updates, persists
    the blocks and the new state height, then resets the round caches.
    Sleeps 0.5s when the local height has caught up with the chain.

    Raises:
        Exception: when the cached block set does not match the batch
        being processed (a consistency failure).
    """
    self.start = await self.get_state()
    self.start += 1
    while True:
        current_height = await self.get_block_count()
        time_a = CT.now()
        if self.start < current_height:
            # Cap the batch at the current chain height.
            stop = min(self.start + self.max_tasks, current_height)
            self.processing.extend(range(self.start, stop))
            max_height = max(self.processing)
            min_height = self.processing[0]
            await asyncio.wait(
                [self.cache_block(h) for h in self.processing])
            if self.processing != sorted(self.cache.keys()):
                msg = 'cache != processing'
                logger.error(msg)
                # Fix: the original followed this raise with an
                # unreachable sys.exit(1); raising alone suffices.
                raise Exception(msg)
            await self.update_sys_fee(min_height)
            # Collect every vin/vout/claim in the batch for fan-out.
            vins = []
            vouts = []
            claims = []
            for block in self.cache.values():
                for tx in block['tx']:
                    txid = tx['txid']
                    height = block['index']
                    for vin in tx['vin']:
                        vins.append([vin, txid, height])
                    for vout in tx['vout']:
                        vouts.append([vout, txid, height])
                    # Claim transactions carry an extra 'claims' list.
                    if 'claims' in tx:
                        for claim in tx['claims']:
                            claims.append([claim, txid, height])
            if vins:
                await asyncio.wait(
                    [self.update_a_vin(*vin) for vin in vins])
            if vouts:
                await asyncio.wait(
                    [self.update_a_vout(*vout) for vout in vouts])
            if claims:
                await asyncio.wait(
                    [self.update_a_claim(*claim) for claim in claims])
            # cache update addresses — only on the final single-block
            # round, when we are fully caught up.
            if stop == current_height and 1 == len(self.processing):
                vinas = await asyncio.gather(
                    *[self.get_address_from_vin(vin[0]) for vin in vins])
                voutas = [vout[0]['address'] for vout in vouts]
                uas = list(set(vinas + voutas))
                await self.update_addresses(max_height, uas)
            time_b = CT.now()
            logger.info(
                'reached %s ,cost %.6fs to sync %s blocks ,total cost: %.6fs'
                % (max_height, time_b - time_a, stop - self.start,
                   time_b - START_TIME))
            await asyncio.wait([
                self.update_block(block) for block in self.cache.values()
            ])
            await self.update_state(max_height)
            self.start = max_height + 1
            # Reset the round caches (rebinding alone frees the old ones;
            # the original's preceding `del` statements were redundant).
            self.processing = []
            self.cache = {}
        else:
            await asyncio.sleep(0.5)
% (max_height, time_b - time_a, stop - self.start, time_b - START_TIME)) await asyncio.wait([ self.update_block(block) for block in self.cache.values() ]) await self.update_state(max_height) self.start = max_height + 1 del self.processing del self.cache self.processing = [] self.cache = {} else: await asyncio.sleep(0.5) if __name__ == "__main__": START_TIME = CT.now() logger.info('STARTING...') mongo_uri = C.get_mongo_uri() neo_uri = C.get_neo_uri() mongo_db = C.get_mongo_db() tasks = C.get_tasks() loop = asyncio.get_event_loop() crawler = Crawler(mongo_uri, mongo_db, neo_uri, loop, tasks) try: loop.run_until_complete(crawler.crawl()) except Exception as e: logger.error('LOOP EXCEPTION: %s' % e) finally: loop.close()
async def crawl(self):
    """Sync transfer history: net per-(asset,address) vin/vout flows.

    For each batched block it caches the spent UTXOs, aggregates a
    transaction's inputs and outputs per asset+address key, cancels the
    self-transfer case (single identical key and value on both sides),
    nets overlapping amounts, then fans out update_a_vin/update_a_vout.

    NOTE(review): indentation reconstructed from a collapsed source
    line; the placement of the utxo-cache sanity check and of the inner
    `del vout_dict[key]` is a best-effort reading — confirm.
    """
    self.start = await self.get_history_state()
    self.start += 1
    while True:
        current_height = await self.get_block_count()
        time_a = CT.now()
        if self.start < current_height:
            # Batch at most max_tasks blocks, capped at chain height.
            stop = self.start + self.max_tasks
            if stop >= current_height:
                stop = current_height
            self.processing.extend([i for i in range(self.start, stop)])
            max_height = max(self.processing)
            min_height = self.processing[0]
            await asyncio.wait(
                [self.cache_block(h) for h in self.processing])
            if self.processing != sorted(self.cache.keys()):
                msg = 'cache != processing'
                logger.error(msg)
                sys.exit(1)
            # Gather every txid referenced by a vin so the spent vouts
            # can be fetched and cached.
            txids = []
            for block in self.cache.values():
                for tx in block['tx']:
                    for vin in tx['vin']:
                        txids.append(vin['txid'])
            txids = list(set(txids))
            if txids:
                await asyncio.wait(
                    [self.cache_utxo_vouts(txid) for txid in txids])
                if sorted(txids) != sorted(self.cache_utxo.keys()):
                    msg = 'cache utxo error'
                    logger.error(msg)
                    sys.exit(1)
            vins = []
            vouts = []
            for block in self.cache.values():
                block_time = block['time']
                for tx in block['tx']:
                    # Aggregate inputs per asset_address key.
                    utxo_dict = {}
                    for vin in tx['vin']:
                        utxo = self.cache_utxo[vin['txid']][vin['vout']]
                        key = utxo['asset'] + '_' + utxo['address']
                        if key in utxo_dict.keys():
                            utxo_dict[key]['value'] = CT.sci_to_str(
                                str(
                                    D(utxo_dict[key]['value']) +
                                    D(utxo['value'])))
                        else:
                            utxo_dict[key] = utxo
                    # Aggregate outputs per asset_address key.
                    vout_dict = {}
                    for vout in tx['vout']:
                        key = vout['asset'] + '_' + vout['address']
                        if key in vout_dict.keys():
                            vout_dict[key]['value'] = CT.sci_to_str(
                                str(
                                    D(vout_dict[key]['value']) +
                                    D(vout['value'])))
                        else:
                            vout_dict[key] = vout
                    # Pure self-transfer (same single key, same value):
                    # no net flow, skip this transaction.
                    if 1 == len(utxo_dict) == len(
                            vout_dict) and utxo_dict.keys(
                            ) == vout_dict.keys():
                        key = list(utxo_dict.keys())[0]
                        if utxo_dict[key]['value'] == vout_dict[key][
                                'value']:
                            continue
                    utxos = list(utxo_dict.values())
                    for i in range(len(utxos)):
                        utxo = utxos[i]
                        key = utxo['asset'] + '_' + utxo['address']
                        if key in vout_dict.keys():
                            # Net out the change returned to the sender.
                            if D(utxo['value']) > D(
                                    vout_dict[key]['value']):
                                utxo['value'] = CT.sci_to_str(
                                    str(
                                        D(utxo['value']) -
                                        D(vout_dict[key]['value'])))
                                del vout_dict[key]
                        vins.append([utxo, tx['txid'], i, block_time])
                    voutx = list(vout_dict.values())
                    for k in range(len(voutx)):
                        vout = voutx[k]
                        vouts.append([vout, tx['txid'], k, block_time])
            if vins:
                await asyncio.wait(
                    [self.update_a_vin(*vin) for vin in vins])
            if vouts:
                await asyncio.wait(
                    [self.update_a_vout(*vout) for vout in vouts])
            time_b = CT.now()
            logger.info(
                'reached %s ,cost %.6fs to sync %s blocks ,total cost: %.6fs'
                % (max_height, time_b - time_a, stop - self.start,
                   time_b - START_TIME))
            await self.update_history_state(max_height)
            self.start = max_height + 1
            # Drop and recreate the per-round caches.
            del self.processing
            del self.cache
            del self.cache_utxo
            self.processing = []
            self.cache = {}
            self.cache_utxo = {}
        else:
            await asyncio.sleep(0.5)
def solveReviewPage(asin, rank, url, fetchDate, bookPublishDate):
    """Parse one Amazon review-list page and persist each review.

    For every review div: extracts helpfulness votes, star rate, title,
    reviewer ID, dates, format, verified flag, body text and comment
    count, then saves via saveReview. Returns the running helpful rank.

    NOTE(review): indentation reconstructed from a collapsed source
    line; nesting of the match1/match2 fallback is a best-effort
    reading — confirm against the original file.
    """
    hlre = re.compile(
        r'^(\d+) of (\d+) people found the following review helpful')
    html = MyHtml.getHtml(url)
    print "solving Review Page"
    countOfReviews = int(
        html.xpath('.//div[@id="cm_cr-product_info"]/div/div[1]/div[2]/span'
                   )[0].text.strip())
    if countOfReviews > 0:
        divWholeReviewList = html.xpath('.//div[@id="cm_cr-review_list"]')[0]
        divReviewList = divWholeReviewList.xpath('./div[@id]')
        print divReviewList
        for divReview in divReviewList:
            aReview = Review()
            rank += 1
            aReview.helpfulRank = rank
            aReview.asin = asin
            aReview.reviewID = divReview.attrib['id']
            # helpful line
            parentNode = divReview.xpath(
                './/span[@class="a-size-base cr-vote"]/span[1]/span[1]')
            print "helpful Match", parentNode
            helpfulMatch = None
            if parentNode is not None:
                match1 = divReview.xpath(
                    './/span[@class="a-size-base cr-vote"]/span[1]/span[1]')
                if match1 is not None and len(match1) > 0:
                    if match1[0].text is not None:
                        helpfulMatch = hlre.match(match1[0].text.strip())
                else:
                    # Fallback: votes sometimes sit one span deeper.
                    match2 = divReview.xpath(
                        './/span[@class="a-size-base cr-vote"]/span[1]/span[1]/span[1]')
                    if match2 is not None and len(match2) > 0:
                        if match2[0].text is not None:
                            helpfulMatch = hlre.match(match2[0].text.strip())
            print helpfulMatch
            if helpfulMatch:
                aReview.helpful = int(helpfulMatch.group(1))
                aReview.total = int(helpfulMatch.group(2))
                aReview.helpfulness = aReview.helpful * \
                    100 / aReview.total / 100.0
            else:
                aReview.helpful = 0
                aReview.total = 0
                aReview.helpfulness = 0
            del helpfulMatch
            print "getting rate data"
            # e.g. "4.0 out of 5 stars" -> "4.0"
            rateData = divReview.xpath(
                './/span[@class="a-icon-alt"]')[0].text.strip()
            aReview.rate = rateData.split(' ')[0].strip()
            aReview.title = divReview.xpath(
                './/a[@class="a-size-base a-link-normal review-title a-color-base a-text-bold"]'
            )[0].text.strip()
            print "getting reviewerId"
            reviewer = divReview.xpath(
                './/span[@class="a-size-base a-color-secondary review-byline"]/a')
            if reviewer is not None and len(reviewer) > 0:
                # Reviewer ID is the 5th path segment of the profile href.
                aReview.reviewerID = reviewer[0].attrib['href'].split(
                    '/')[4].split('?')[0]
            print "reviewerId", aReview.reviewerID
            aReview.date = CommonTool.strToDate(
                divReview.xpath('./div[@class="a-row"]/span[4]')[0].text.strip())
            aReview.elapsedDate = (fetchDate - aReview.date).days
            print bookPublishDate
            if bookPublishDate == 'N/A':
                aReview.reviewBookDate = 'N/A'
            else:
                aReview.reviewBookDate = (aReview.date - (bookPublishDate)).days
            # format line
            try:
                strFormat = divReview[3].xpath('./a[1]')[0].text.strip()
                aReview.fromFormat = strFormat.split(' ')[1]
            except IndexError:
                aReview.fromFormat = ''
            spanVerifiedPurchase = divReview.xpath(
                './/span[@class="a-size-mini a-color-state a-text-bold"]')
            if spanVerifiedPurchase:
                spanVerifiedPurchase = spanVerifiedPurchase[0].text.strip()
                if spanVerifiedPurchase == "Verified Purchase":
                    aReview.verified = 1
            else:
                aReview.verified = 0
            # review text line
            divReviewText = divReview.xpath(
                './/div[@class="a-row review-data"]/span')[0]
            aReview.description = filtTag.filter_tags(
                etree.tostring(divReviewText).strip()).strip()
            aReview.description = aReview.description.replace('\n', '<br />')
            del divReviewText
            # review comments line
            aReview.numOfComments = CommonTool.strToInt(
                divReview.xpath(
                    './/div[@class="a-row a-spacing-top-small review-comments"]/div/a/span/span[1]'
                )[0].text.strip())
            aReview.getComments()
            saveReview(review=aReview)
            # try:
            #     reviewer = Reviewer.loadReviewer(aReview.reviewerID)
            #     aReview.lastReviewRank = reviewer.getPreBookReviewRanking(
            #         aReview.reviewID)
            # except Exception, e:
            #     sys.stderr.write(str(e) + '\n')
            #     sys.stderr.write('lastReviewRank not found! url: {0} id: {1} \
            #         reviewerID: {2}\n'.format(
            #         url, aReview.reviewID, aReview.reviewerID))
            #     import traceback
            #     traceback.print_exc()
        # end of for
    # end of else
    return rank
async def crawl(self):
    """Scan batched blocks for asset creations and persist them.

    Global assets come from RegisterTransaction entries; NEP-5 assets
    from InvocationTransactions whose script ends with the syscall for
    "Neo.Contract.Create" (that hex suffix decodes to exactly that
    string) and whose sys_fee is at least 490.
    """
    self.start = await self.get_asset_state()
    self.start += 1
    while True:
        current_height = await self.get_block_count()
        time_a = CT.now()
        if self.start < current_height:
            # Batch at most max_tasks blocks, capped at chain height.
            stop = self.start + self.max_tasks
            if stop >= current_height:
                stop = current_height
            self.processing.extend([i for i in range(self.start, stop)])
            max_height = max(self.processing)
            min_height = self.processing[0]  # unused in this crawler
            await asyncio.wait(
                [self.cache_block(h) for h in self.processing])
            if self.processing != sorted(self.cache.keys()):
                msg = 'cache != processing'
                logger.error(msg)
                sys.exit(1)
            global_assets = {}
            nep5_assets = {}
            for block in self.cache.values():
                block_time = block['time']
                for tx in block['tx']:
                    if 'RegisterTransaction' == tx['type']:
                        global_assets[tx['txid']] = tx['asset']
                        global_assets[tx['txid']]['time'] = block_time
                    if 'InvocationTransaction' == tx[
                            'type'] and 490 <= int(float(tx['sys_fee'])):
                        # Hex suffix = syscall "Neo.Contract.Create".
                        if tx['script'].endswith(
                                '68134e656f2e436f6e74726163742e437265617465'
                        ):
                            try:
                                asset = self.parse_script(tx['script'])
                            except Exception as e:
                                # Unparseable creation script: skip it.
                                print('parse error:', e)
                                continue
                            asset['time'] = block_time
                            nep5_assets[asset['contract']] = asset
            if global_assets:
                await asyncio.wait([
                    self.update_a_global_asset(*i)
                    for i in global_assets.items()
                ])
            if nep5_assets:
                await asyncio.wait([
                    self.update_a_nep5_asset(*i)
                    for i in nep5_assets.items()
                ])
            time_b = CT.now()
            logger.info(
                'reached %s ,cost %.6fs to sync %s blocks ,total cost: %.6fs'
                % (max_height, time_b - time_a, stop - self.start,
                   time_b - START_TIME))
            await self.update_asset_state(max_height)
            self.start = max_height + 1
            # Drop and recreate the per-round caches.
            del self.processing
            del self.cache
            self.processing = []
            self.cache = {}
        else:
            await asyncio.sleep(0.5)
self.paperbackPrice = 0 priceList = divCenterCol.find(".//div[@id='tmmSwatches']/ul") if priceList is None: priceList = divCenterCol.xpath( ".//div[@id='twister']/div/span[@class='a-declarative']/table/tr\ ") for price in priceList: try: priceType = price.xpath( './td[@class="dp-title-col"]/*[@class="title-text"]/span\ ')[0].text.strip() priceValue = price.xpath( "./td[@class='a-text-right dp-price-col']//span")[0]\ .text.strip() if cmp(priceType, 'Kindle') == 0: self.kindlePrice = CommonTool.strToFloat(priceValue) elif cmp(priceType, 'Hardcover') == 0: self.hardcoverPrice = CommonTool.strToFloat(priceValue) elif cmp(priceType, 'Paperback') == 0: self.paperbackPrice = CommonTool.strToFloat(priceValue) except Exception, e: pass else: priceList = priceList.xpath(".//li") for priceLi in priceList: spans = priceLi.xpath("./span/span/span/a/span") priceType = spans[0].text.strip() priceValue = spans[1] if priceValue.find("./span") is not None: priceValue = priceValue.find("./span").text.strip() else:
def printData(self):
    """Write one joined row per review to ../data/<fetchDate>/dataAll.txt.

    Each row joins the Review, its Book and its Reviewer into the column
    order defined by self.tableHeadList. Column order is the file's
    contract — do not reorder the ct.write calls.
    """
    with open("../data/" + self.fetchDate.isoformat() + "/dataAll.txt",
              "w") as fout:
        ct = CommonTool(fout)
        fout.write("\t".join(self.tableHeadList))
        fout.write('\n')
        fout.flush()
        for i, reviewID in enumerate(self.reviewList):
            if reviewID == '':
                continue
            print 'solve review {0} of {1}: {2}'.format(
                i, len(self.reviewList), reviewID)
            review = Review.loadReview(reviewID)
            book = Book.loadBookByAsin(review.asin)
            reviewer = Reviewer.loadReviewer(review.reviewerID)
            # Identity columns.
            ct.write(reviewID)
            ct.write(review.asin)
            ct.write(review.reviewerID)
            # Reviewer badge columns.
            ct.write(reviewer.rName)
            ct.write(reviewer.tRev1)
            ct.write(reviewer.tRev10)
            ct.write(reviewer.tRev50)
            ct.write(reviewer.tRev100)
            ct.write(reviewer.tRev500)
            ct.write(reviewer.tRev1000)
            ct.write(reviewer.tRevHall)
            ct.write(reviewer.vVoice)
            # Review columns.
            ct.write(review.verified)
            ct.write(review.rate)
            ct.write(review.title)
            ct.write(review.date)
            ct.write(review.fetchDate)
            ct.write(review.reviewBookDate)
            ct.write(review.elapsedDate)
            ct.write(review.helpful)
            ct.write(review.total)
            ct.write(review.helpfulness)
            ct.write(review.helpfulRank)
            ct.write(review.timeRank)
            #ct.write(review.top1Percent)
            #ct.write(review.top5Percent)
            #ct.write(review.top10Percent)
            ct.write(review.description)
            ct.write(review.numOfComments)
            ct.write(review.comment)
            #ct.write(review.isQuoteTable)
            ct.write(review.lastReviewRank)
            # Book columns.
            ct.write(book.url)
            ct.write(book.tag)
            ct.write(book.allowPreview)
            ct.write(book.binding)
            ct.write(book.publishDate)
            ct.write(book.author)
            ct.write(book.authorInfo)
            ct.write(book.rate)
            ct.write(book.numOfReviews)
            ct.write(book.kindlePrice)
            ct.write(book.hardcoverPrice)
            ct.write(book.paperbackPrice)
            ct.write(book.bookDsc)
            ct.write(book.listPrice)
            ct.write(book.pages)
            ct.write(book.isbn10)
            ct.write(book.isbn13)
            ct.write(book.subrank)
            ct.write(book.hasEditorialReview)
            ct.write(book.editorialReview)
            #ct.write(book.hasQuoteTable)
            # Reviewer profile columns.
            ct.write(reviewer.email)
            ct.write(reviewer.webPage)
            ct.write(reviewer.hasPhoto)
            ct.write(reviewer.rNum)
            ct.write(reviewer.helpRate)
            ct.write(reviewer.hVote)
            ct.write(reviewer.tVote)
            ct.write(reviewer.avgRate)
            ct.write(reviewer.fRevTime)
            ct.write(reviewer.lRevTime)
            ct.write(reviewer.duration)
            # Presence flags: 0 when the field is "N/A", else 1.
            if reviewer.rReal == "N/A":
                ct.write(0)
            else:
                ct.write(1)
            if reviewer.location == "N/A":
                ct.write(0)
            else:
                ct.write(1)
            if reviewer.aboutMe == "N/A":
                ct.write(0)
            else:
                ct.write(1)
            if reviewer.interest == "N/A":
                ct.write(0)
            else:
                ct.write(1)
            ct.write(review.fromFormat)
            # if review.fromFormat == "Hardcover":
            #     ct.write(0)
            # elif review.fromFormat == "Paperback":
            #     ct.write(1)
            # else:
            #     ct.write(2)
            if reviewer.rRank == "N/A":
                ct.write(0)
            else:
                ct.write(reviewer.rRank)
            # Final column terminates the row.
            ct.writeln(book.rank)
def script_to_hash(unhex):
    """Hash raw script bytes: SHA-256, then RIPEMD-160, hex-encoded with
    byte order adjusted by CT.big_or_little."""
    sha = hashlib.sha256(unhex).digest()
    ripe = hashlib.new('ripemd160', sha).digest()
    return CT.big_or_little(hexlify(ripe).decode('ascii'))
def hex_to_num_str(cls, hs):
    """Convert a hex string into a decimal amount string.

    Decodes the hex to bytes, interprets them via cls.bytes_to_num, and
    scales the result down by 10**8 before formatting through
    CT.sci_to_str (which presumably avoids scientific notation —
    confirm against CT's implementation).
    """
    raw = unhexlify(hs)
    amount = D(cls.bytes_to_num(raw)) / 100000000
    return CT.sci_to_str(str(amount))
def getPreviousReviewedBook(self, reviewerId):
    """Walk a reviewer's member-reviews pages to find the previous book.

    Pages through member-reviews, records the first "Edition" row's
    review date and the book's publish date, fills fRevTime/lRevTime
    when empty, accumulates star rates into self.sum, and finally
    derives avgRate and duration.

    NOTE(review): indentation reconstructed from a collapsed source
    line; the placement of the j==1 / j==totalPages / rate sections
    relative to the row loop is a best-effort reading — confirm.
    """
    initUrl = "http://www.amazon.com/gp/cdp/member-reviews/" + reviewerId
    self.allRevLink = initUrl
    html = MyHtml.getHtml(initUrl)
    ftable = html.xpath('.//body/table[2]')[0]
    pages = ftable.xpath('./tr/td[2]/table[1]/tr[1]/td[2]/b/a[last()]')
    if pages is not None and len(pages) > 0:
        totalPages = pages[0].text.strip()
    else:
        totalPages = 1
    strPages = str(totalPages)
    print strPages
    # Pager text may be a range like "91-100"; take the upper bound.
    if "-" in strPages:
        totalPages = totalPages.split('-')
        print "totalPages", totalPages
        totalPages = totalPages[1]
    else:
        totalPages = totalPages
    print totalPages, "totalPages"
    sortBy = 'MostRecentReview'
    j = 1
    self.counter = 0
    flag = 0  # set to 1 once the previous book has been found
    for j in range(1, (int(totalPages) + 1)):
        baseUrl = "http://www.amazon.com/gp/cdp/member-reviews/" + reviewerId
        baseUrl = baseUrl + \
            '?pageNumber={}&sortBy={}'.format(str(j), sortBy)
        print baseUrl, j, totalPages
        html = MyHtml.getHtml(baseUrl)
        ftable = html.xpath('.//body/table[2]')[0]
        mainTable = ftable.xpath('./tr/td[2]/table[2]/tr[@valign="top"]')
        for row in mainTable:
            if row is not None:
                isBook = row.xpath('./td[5]/table/tr[2]/td/b')
                if isBook is not None and len(isBook) > 0:
                    if isBook[0].text is not None and flag == 0:
                        edition = isBook[0].text.strip()
                        if "Edition" in edition:
                            print "got the previous book"
                            flag = 1
                            reviewdate = row.xpath('./following-sibling::*')
                            reviewdate = reviewdate[0].xpath('.//nobr')
                            if reviewdate:
                                reviewdate = reviewdate[0].text.strip()
                                print "got the reviewDate", reviewdate
                                self.previousBookReviewDate = CommonTool.strToDate(reviewdate)
                            # to get link of the previous reviewedbook
                            url = row.xpath('./td[5]/table/tr[1]/td/b/a')[0].attrib['href']
                            asin = Book.getAsinFromUrl(url)
                            previousBook = Book.loadBookByAsin(asin)
                            print "asinofPrevious", asin
                            print "previousBook", previousBook
                            self.previousBookPublishDate = previousBook.publishDate
            # First page, first row: most recent review date.
            if j == 1 and self.lRevTime == '':
                reviewdate = mainTable[0].xpath('./following-sibling::*')
                reviewdate = reviewdate[0].xpath('.//nobr')
                print reviewdate, "lRevtime"
                if reviewdate:
                    reviewdate = reviewdate[0].text.strip()
                    self.lRevTime = CommonTool.strToDate(reviewdate)
            print "value of j", j
            # Last page, last row: earliest review date.
            if j == int(totalPages) and self.fRevTime == '':
                print "inside frevtime loop"
                reviewdate = mainTable[-1].xpath('./following-sibling::*')
                reviewdate = reviewdate[0].xpath('.//nobr')
                print reviewdate, "fRevtime"
                if reviewdate:
                    reviewdate = reviewdate[0].text.strip()
                    self.fRevTime = CommonTool.strToDate(reviewdate)
            # rate
            rateObj = row.xpath("./following-sibling::*")
            rate = 'N/A'
            if len(rateObj) != 0:
                rateObj1 = rateObj[0].xpath(".//img")
                title = rateObj1[0].attrib['title']
                # e.g. "4.0 out of 5 stars" -> "4.0"
                rate = title.split("out")[0].strip()
                self.sum = self.sum + float(rate)
        j = j + 1
        # end of inner for loop
    print "sum", self.sum
    # NOTE(review): `is not 0` tests identity, not equality — works for
    # small ints in CPython only; should be `!= 0`.
    if self.rNum is not 0:
        self.avgRate = self.sum / self.rNum
        self.avgRate = round(self.avgRate, 2)
    if self.lRevTime == '' or self.fRevTime == '':
        duration = 0
    else:
        duration = (self.lRevTime - self.fRevTime).days
    self.duration = int(duration)
def solveProfileUp(self):
    """Parse the upper section of a reviewer profile page into self.

    Fills rName, rRank (with a fallback path for the alternate layout),
    top-reviewer badge flags, interest, aboutMe, email/webPage flags,
    rNum, helpRate and hVote/tVote. Returns early when no profile
    details div exists.

    NOTE(review): indentation and two string literals split across the
    collapsed source were reconstructed — confirm the xpath strings and
    the ranking-section nesting against the original file.
    """
    profile = self.html.xpath("//div[@class='a-row profile-details']")
    if len(profile) == 0:
        return
    else:
        profile = profile[0]
    # Reviewer Name
    self.rName = self.html.xpath(
        ".//div[@class='a-section']/h1")[0].text.strip()
    if len(self.rName) == 0:
        self.rName = "N/A"
    # Reviewer Ranking
    rRank = profile.xpath(".//div[@class='profile-info']")
    # NOTE(review): the loop variable and rRank are repeatedly rebound
    # below — preserved verbatim from the original.
    for rank in rRank:
        if len(rRank) != 0:
            rRank = rank.xpath(
                ".//div[@class='a-row']/span[@class='a-size-large a-text-bold']")
            if len(rRank) != 0:
                rRank = rRank[0].text.strip()
                if len(rRank) != 0:
                    #rRank=rRank.split(':')[1]
                    self.rRank = rRank
                    self.rRank = self.rRank.replace('#', '')
                    self.rRank = CommonTool.strToInt(self.rRank)
                else:
                    self.rRank = 'N/A'
            else:
                # Alternate layout: ranking text lives in a small
                # secondary span such as "reviewer ranking #1234".
                print "entered else in ranking"
                rRank = profile.xpath(
                    ".//span[@class='a-size-small a-color-secondary']")
                print rRank
                for rank in rRank:
                    if "ranking" in rank.text:
                        rRank = rank
                        print "aranking ", rRank
                if len(rRank) > 0 and rRank[0].text is not None:
                    rRank = rRank[0].text.strip()
                    if "#" in rRank:
                        rRank = rRank.split('#')
                        print "rank", rRank
                        self.rRank = rRank[1]
                        self.rRank = CommonTool.strToInt(self.rRank)
                    else:
                        self.rRank = 'N/A'
                else:
                    self.rRank = 'N/A'
    # Top Reviewer && Vine Voice
    self.tRev1 = 0
    self.tRev10 = 0
    self.tRev50 = 0
    self.tRev100 = 0
    self.tRev500 = 0
    self.tRev1000 = 0
    self.tRevHall = 0
    self.rReal = 0
    #self.vVoice = '0'
    tRev = profile.xpath(
        ".//span[@class='a-color-link pr-c7y-badge a-text-bold']")
    if len(tRev) != 0:
        temp = tRev[0].text.strip()
        if temp.find('#1 REVIEWER') != -1:
            self.tRev1 = 1
        elif temp.find('TOP 10 REVIEWER') != -1:
            self.tRev10 = 1
        elif temp.find('TOP 50 REVIEWER') != -1:
            self.tRev50 = 1
        elif temp.find('TOP 100 REVIEWER') != -1:
            self.tRev100 = 1
        elif temp.find('TOP 500 REVIEWER') != -1:
            self.tRev500 = 1
        elif temp.find('TOP 1000 REVIEWER') != -1:
            self.tRev1000 = 1
        elif temp.find('HALL OF FAME') != -1:
            self.tRevHall = 1
        #elif temp.find('VINE VOICE') != -1:
            #self.vVoice = 1
    # INTEREST
    self.interest = profile.xpath(
        ".//div[@class='a-row a-spacing-medium profile-interests']")
    if len(self.interest) == 0:
        self.interest = 'N/A'
        # print '(interest) No a-row a-spacing-medium profile-interests is
        # found!'
    else:
        self.interest = self.interest[0].xpath(
            "./div/span[@class='a-size-small']")[0].text.strip()
        self.interest = self.interest.replace('\r', '')
        self.interest = self.interest.replace('\n', '<br>')
    # ABOUT ME
    abtMe = profile.xpath(
        ".//span/text()[normalize-space(.)='About']/parent::*/\
following-sibling::div/div[1]/span/p")
    print abtMe, "abtme"
    if abtMe is not None and len(abtMe) > 0:
        for p in abtMe:
            about = p.text.strip()\
                .replace('\r', '').replace('\n', '<br />')
            # NOTE(review): aboutMe is reset inside the loop, so only
            # the last <p> survives — looks like a bug; confirm.
            self.aboutMe = ''
            self.aboutMe = self.aboutMe + about
    #except LookupError:
        #self.aboutMe = 'N/A'
    # Email && webpage (stored as 0/1 presence flags)
    link1 = profile.xpath(".//div[@class='a-row break-word pr-link']/a")
    if len(link1) == 0:
        # print '(email) No a-size-small found!'
        self.email = 0
    else:
        # self.email = link1[0].text.strip()
        self.email = 1
    link2 = profile.xpath(
        ".//div[@class='a-row customer-website pr-link']/a/span")
    if len(link2) == 0:
        self.webPage = 0
    else:
        self.webPage = 1
    # rNum: review count parsed from text like "Reviews (123)"
    rNum = profile.xpath(".//div[@class='a-column a-span7 pr-link']/a/span")
    if len(rNum) == 0:
        self.rNum = 0
    else:
        rNum = rNum[0].text.strip()
        if "Reviews" in rNum:
            print rNum
            rNum = rNum.split('(')
            rNum = rNum[1]
            rNum = rNum.split(')')
            rNum = rNum[0]
        else:
            rNum = '0'
        self.rNum = CommonTool.strToInt(rNum)
    # helpRate: percentage text like "95%" -> 0.95
    helpful = profile.xpath(".//div[@class='a-row customer-helpfulness']")
    if len(helpful) == 0:
        # print '(helpful) No a-size-large a-text-bold found!'
        self.helpRate = 0.0
    else:
        self.helpRate = helpful[0].xpath(
            ".//span[@class='a-size-large a-text-bold']")
        if len(self.helpRate) == 0:
            # print '(helpRate) No a-size-large a-text-bold found!'
            self.helpRate = 0.0
        else:
            self.helpRate = self.helpRate[0].text.strip()
            self.helpRate = int(self.helpRate[:-1]) / 100.0
    # hVote && tVote: text like "(12 of 34)"
    votes = profile.xpath(
        "./span/div/div/div/span[@class='a-size-small a-color-secondary']")
    if len(votes) == 0:
        # print '(votes) No a-size-small a-color-secondary found!'
        self.hVote = 0
        self.tVote = 0
    else:
        votes = votes[0].text.strip()
        votesList = votes.split(' of ')
        self.hVote = votesList[0][1:]
        self.tVote = votesList[1][:-1]
        self.hVote = self.hVote.strip()
        self.tVote = self.tVote.strip()
        self.hVote = CommonTool.strToInt(self.hVote)
        self.tVote = CommonTool.strToInt(self.tVote)
        del votesList
def parse_return_type(cls, mark):
    """Resolve a return-type marker to its argument-type name.

    A string marker is byte-order-adjusted via CT.big_or_little and
    parsed as hexadecimal first; an int marker is looked up directly.
    Raises ValueError for any other type.
    """
    if isinstance(mark, str):
        mark = int(CT.big_or_little(mark), 16)
    if not isinstance(mark, int):
        raise ValueError('wrong type for return {}'.format(mark))
    return cls.get_arg_name(mark)