def extractStatDataFromScript(self, script):
    """Pull the (date, value) series out of an inline chart script.

    The script text is scanned line by line.  A line containing
    ``categories: [`` supplies the dates (each comma-separated item is
    passed through ``IMVDBDateStringToDate``); a line containing
    ``data: [`` supplies the values (each item is passed through
    ``cleanUnicode`` and then ``int``).  Scanning stops at the first
    ``data: [`` line, so a ``categories: [`` line appearing after it is
    never read.

    Returns whatever ``self.filterDataByWeek`` returns for the zipped
    (date, value) pairs.
    """

    def bracketContent(row):
        # Text after the first '[' up to, but not including, the character
        # immediately before the first ']'.
        # NOTE(review): the "- 1" drops the last character ahead of ']' --
        # presumably a trailing separator in the page's script; confirm.
        start = row.find('[') + 1
        stop = row.find(']') - 1
        return row[start:stop]

    dates = []
    values = []
    for row in script.split('\n'):
        if 'categories: [' in row:
            dates = [IMVDBDateStringToDate(item)
                     for item in bracketContent(row).split(',')]
            continue
        if 'data: [' in row:
            values = [int(cleanUnicode(item))
                      for item in bracketContent(row).split(',')]
            # The first data series is the only one consumed.
            break

    return self.filterDataByWeek(zip(dates, values))
def extractDetailStatData(self, tables, URL):
    """Collect the per-page detail statistics into a single dict.

    The result always carries ``'week'`` (``dateToSaturday`` applied to
    ``datetime.today()``) and ``'URL'``.  For every element of ``tables``
    its ``.text`` is run through ``cleanUnicode``; if that text contains
    ``'Views'`` the view/comment counters are read from it, otherwise the
    social-network counters are.  Each counter is obtained with
    ``self.getDetailStatTableData(text, label)``.  When several tables map
    to the same keys, later tables overwrite earlier ones.
    """
    # (result key, label handed to getDetailStatTableData)
    viewFields = (
        ('MVViewCount', 'Views'),
        ('MVCommentCount', 'Comments'),
    )
    socialFields = (
        ('FBLikeCount', 'Facebook Like Count'),
        ('FBShareCount', 'Facebook Share Count'),
        ('FBCommentCount', 'Facebook Comment Count'),
        ('TwitterCount', 'Twitter'),
        ('GooglePlusCount', 'GooglePlusOne'),
    )

    stats = {'week': dateToSaturday(datetime.today()), 'URL': URL}
    for table in tables:
        content = cleanUnicode(table.text)
        fields = viewFields if 'Views' in content else socialFields
        for key, label in fields:
            stats[key] = self.getDetailStatTableData(content, label)
    return stats
def extractContent(self, textDict):
    """Download a page and return the text of its top-level paragraphs.

    ``textDict["URL"]`` is fetched and parsed with ``BeautifulSoup``.  The
    container element is looked up by class: ``article-body`` when
    ``textDict["type"] == "article"``, ``entry`` otherwise.  The ``.text``
    of every direct child of that container that is a ``<p>`` tag is
    concatenated, and the result is passed through ``cleanUnicode``.

    This is best-effort: any exception (unreachable URL, missing key,
    container not found, ...) is printed and the method falls back to
    ``cleanUnicode("")`` instead of raising.
    """
    from contextlib import closing

    try:
        # The response object is not a context manager under Python 2, so
        # wrap it in closing() to guarantee the connection is released even
        # if read() fails.
        with closing(urllib2.urlopen(textDict["URL"])) as page:
            markup = page.read()
        soup = BeautifulSoup(markup)

        if textDict["type"] == "article":
            body = soup.find(attrs={"class": "article-body"})
        else:
            body = soup.find(attrs={"class": "entry"})

        # Only direct children are inspected.  The type-name test skips
        # non-tag nodes (plain strings, comments) before .name is touched.
        paragraphs = [
            content.text
            for content in body.contents
            if "Tag" in type(content).__name__ and content.name == "p"
        ]
        text = "".join(paragraphs)
    except Exception as e:
        # The URL may be invalid or the page may lack the expected
        # container; report it and return empty content.
        print(e)
        text = ""
    return cleanUnicode(text)