def getWeightHistory(self, userId=None):
    """Scrape the weight-history page of each selected user and store it.

    For every user returned by ``convertUserIdToUserList(userId)`` that has
    a non-None ``serverId``, the page at ``getURL(user, 1)`` is fetched and
    parsed for the diet name, the start weight, the goal weight and a list
    of ``(date, weight)`` tuples sorted by date.

    ``db.updateWeightHistory`` is called once per user in every case:
    for users that are skipped, and for users whose page could not be fully
    parsed, the values that were not extracted are passed as None.
    Exceptions raised while scraping are caught and handed to
    ``logException``.

    :param userId: forwarded unchanged to ``convertUserIdToUserList``.
    """
    for user in self.convertUserIdToUserList(userId):
        diet = startWeight = goalWeight = weightHistory = None
        try:
            # The 'serverId' key can be absent from a user record; treat
            # that like None instead of raising (and logging) a KeyError.
            if 'serverId' in user and user['serverId'] is not None:
                page = self.br.open(self.getURL(user, 1))
                soup = BeautifulSoup(page.read())

                diet = soup.find('b').contents[1].text

                # Both weights are read from the same element; each value
                # is the first token after ': '.
                summary = soup.find(attrs={'style': 'padding:0px 10px'})
                startWeight = float(
                    summary.contents[1].split(': ')[1].split()[0])
                goalWeight = float(
                    summary.contents[0].text.split(': ')[1].split()[0])

                dateList = [
                    parser.parse(cell.text)
                    for cell in soup.findAll(
                        attrs={'class': 'borderBottom date'})
                ]
                weightList = [
                    float(cell.text.split()[0])
                    for cell in soup.findAll(
                        attrs={'class': 'borderBottom weight'})
                ]
                # Dates and weights are paired by position, then ordered
                # by date.
                weightHistory = sorted(
                    zip(dateList, weightList),
                    key=lambda record: record[0])
        except Exception as e:
            logException(user['id'], self.getWeightHistory.__name__, e)
        finally:
            # Always persist, even when nothing (or only part) was scraped.
            self.db.updateWeightHistory(
                user['id'], diet, startWeight, goalWeight, weightHistory)
def getBuddy(self, userId=None):
    """Collect the buddies of each selected user and record them.

    For every user returned by ``convertUserIdToUserList(userId)`` that has
    a non-None ``serverId``, the buddy listing starting at
    ``getURL(user, 5)`` is walked page by page. Each non-empty member link
    is registered through ``db.addNewUser``; when the returned record has
    no ``serverId`` key, ``getServerId`` is called for it.

    ``db.addBuddyInUser`` is called once per user in every case with the
    ids gathered so far (an empty list for skipped users, a partial list
    when scraping failed midway). Exceptions raised while scraping are
    caught and handed to ``logException``.

    :param userId: forwarded unchanged to ``convertUserIdToUserList``.
    """
    for user in self.convertUserIdToUserList(userId):
        buddyIdList = []
        try:
            # The 'serverId' key can be absent from a user record; treat
            # that like None instead of raising (and logging) a KeyError.
            if 'serverId' in user and user['serverId'] is not None:
                buddyURL = self.getURL(user, 5)
                while True:
                    page = self.br.open(buddyURL)
                    soup = BeautifulSoup(page.read())
                    memberLinks = soup.findAll(
                        'a',
                        attrs={'class': 'member', 'onmouseout': 'hideTip()'})
                    for tag in memberLinks:
                        if tag.text != '':
                            buddy = self.db.addNewUser(tag.text.strip())
                            buddyIdList.append(buddy['id'])
                            if 'serverId' not in buddy:
                                self.getServerId(buddy['id'])

                    # Keep going while the page offers a "next" link.
                    nextLink = soup.find('span', attrs={'class': 'next'})
                    if nextLink is None:
                        break
                    buddyURL = ('http://fatsecret.com/'
                                + nextLink.contents[0].attrs['href'])
        except Exception as e:
            logException(user['id'], self.getBuddy.__name__, e)
        finally:
            # Always persist, even when nothing (or only part) was scraped.
            self.db.addBuddyInUser(user['id'], buddyIdList)
def run(self):
    """Run every crawl step for ``self.userId`` on a new DataExtractor.

    The steps are executed in a fixed order; the first exception raised
    stops the remaining steps and is handed to ``logException``.
    """
    # Method names are resolved one at a time, right before each call.
    stepNames = (
        'getServerId',
        'getWeightHistory',
        'getDietHistory',
        'getGroup',
        'getChallenge',
        'getBuddy',
    )
    try:
        logInfo(self.userId, 'start crawling')
        extractor = DataExtractor()
        for stepName in stepNames:
            getattr(extractor, stepName)(self.userId)
        logInfo(self.userId, 'Done crawling')
    except Exception as e:
        logException(self.userId, self.run.__name__, e)
def getDietHistory(self, userId=None):
    """Scrape the diet-history page of each selected user and store it.

    For every user returned by ``convertUserIdToUserList(userId)`` that has
    a non-None ``serverId``, the page at ``getURL(user, 2)`` is fetched.
    Each table row with exactly 13 child nodes becomes a tuple
    ``(date, food, RDI, fat, protein, carbs, exercise, net)``. A row that
    fails to parse is logged with the note 'scrape row error' and skipped.

    When the user record already carries a non-None ``dietHistory``, the
    new rows are combined with it through ``mergeDietTrack``; otherwise
    they are sorted by date.

    ``db.updateDietHistory`` is called once per user in every case; the
    history is None for skipped users and when the page could not be
    fetched or holds no month headers. Exceptions raised while scraping are
    caught and handed to ``logException``.

    :param userId: forwarded unchanged to ``convertUserIdToUserList``.
    """
    for user in self.convertUserIdToUserList(userId):
        dietHistory = None
        try:
            # The 'serverId' key can be absent from a user record; treat
            # that like None instead of raising (and logging) a KeyError.
            if 'serverId' in user and user['serverId'] is not None:
                page = self.br.open(self.getURL(user, 2))
                soup = BeautifulSoup(page.read())

                months = soup.findAll(
                    'td', attrs={'colspan': '6', 'class': 'borderBottom'})
                if not months:
                    raise Exception('no diet history records')
                monthList = [
                    datetime.strptime(month.text, '%B %Y')
                    for month in months
                ]

                rows = soup.findAll('tr', attrs={'valign': 'middle'})
                # 32 is larger than any day number, so the first row never
                # advances the month index.
                prevDay = 32
                monthIndex = 0
                dietHistory = []
                for row in rows:
                    try:
                        if len(row.contents) != 13:
                            continue
                        day = int(re.sub('[^0-9]', '', row.contents[1].text))
                        # A day number that does not decrease is taken as
                        # the start of the next listed month.
                        # NOTE(review): presumably rows run newest-first
                        # within a month -- confirm against the page.
                        if day >= prevDay:
                            monthIndex += 1
                        prevDay = day
                        month = monthList[monthIndex]
                        date = datetime(month.year, month.month, day)
                        food = self.getIntFromRawString(row.contents[3].text)
                        RDI = self.getDecimalFromPercentageString(
                            row.contents[5].text)
                        fat, protein, carbs = self.getDataFromNutrionalSummary(
                            row.contents[7].text)
                        exercise = self.getIntFromRawString(
                            row.contents[9].text)
                        net = self.getIntFromRawString(row.contents[11].text)
                        dietHistory.append(
                            (date, food, RDI, fat, protein, carbs,
                             exercise, net))
                    except Exception as e:
                        logException(
                            user['id'], self.getDietHistory.__name__, e,
                            'scrape row error')

                if 'dietHistory' in user and user['dietHistory'] is not None:
                    dietHistory = self.mergeDietTrack(
                        user['dietHistory'], dietHistory)
                else:
                    dietHistory.sort(key=lambda item: item[0])
        except Exception as e:
            logException(user['id'], self.getDietHistory.__name__, e)
        finally:
            # Always persist, even when nothing was scraped.
            self.db.updateDietHistory(user['id'], dietHistory)
def getChallenge(self, userId=None):
    """Collect the challenges of each selected user and record them.

    For every user returned by ``convertUserIdToUserList(userId)`` that has
    a non-None ``serverId``, the page at ``getURL(user, 4)`` is fetched.
    Each matching table cell yields a challenge name (the ``title``
    attribute of its second child node), which is registered through
    ``db.addNewChallenge`` and linked to the user through
    ``db.addUserInChallenge``.

    ``db.addChallengeInUser`` is called once per user in every case with
    the ids gathered so far (an empty list for skipped users, a partial
    list when scraping failed midway). Exceptions raised while scraping are
    caught and handed to ``logException``.

    :param userId: forwarded unchanged to ``convertUserIdToUserList``.
    """
    for user in self.convertUserIdToUserList(userId):
        challengeIdList = []
        try:
            # The 'serverId' key can be absent from a user record; treat
            # that like None instead of raising (and logging) a KeyError.
            if 'serverId' in user and user['serverId'] is not None:
                page = self.br.open(self.getURL(user, 4))
                soup = BeautifulSoup(page.read())
                cells = soup.findAll(
                    'td', attrs={'width': '50', 'align': 'center'})
                for cell in cells:
                    challengeName = cell.contents[1].attrs['title']
                    challenge = self.db.addNewChallenge(challengeName)
                    self.db.addUserInChallenge(user['id'], challenge['id'])
                    challengeIdList.append(challenge['id'])
        except Exception as e:
            logException(user['id'], self.getChallenge.__name__, e)
        finally:
            # Always persist, even when nothing (or only part) was scraped.
            self.db.addChallengeInUser(user['id'], challengeIdList)
def getGroup(self, userId=None):
    """Collect the groups of each selected user and record them.

    For every user returned by ``convertUserIdToUserList(userId)`` that has
    a non-None ``serverId``, the page at ``getURL(user, 3)`` is fetched.
    Each matching table cell yields a group name (the ``title`` attribute
    of its second child node), which is registered through
    ``db.addNewGroup`` and linked to the user through ``db.addUserInGroup``.

    ``db.addGroupInUser`` is called once per user in every case with the
    ids gathered so far (an empty list for skipped users, a partial list
    when scraping failed midway). Exceptions raised while scraping are
    caught and handed to ``logException``.

    :param userId: forwarded unchanged to ``convertUserIdToUserList``.
    """
    for user in self.convertUserIdToUserList(userId):
        groupIdList = []
        try:
            # The 'serverId' key can be absent from a user record; treat
            # that like None instead of raising (and logging) a KeyError.
            if 'serverId' in user and user['serverId'] is not None:
                page = self.br.open(self.getURL(user, 3))
                soup = BeautifulSoup(page.read())
                cells = soup.findAll(
                    'td', attrs={'width': '50', 'align': 'center'})
                for cell in cells:
                    groupName = cell.contents[1].attrs['title']
                    group = self.db.addNewGroup(groupName)
                    self.db.addUserInGroup(user['id'], group['id'])
                    groupIdList.append(group['id'])
        except Exception as e:
            logException(user['id'], self.getGroup.__name__, e)
        finally:
            # Always persist, even when nothing (or only part) was scraped.
            self.db.addGroupInUser(user['id'], groupIdList)
def getServerId(self, userId=None):
    """Look up and store the server id of each selected user lacking one.

    Users returned by ``convertUserIdToUserList(userId)`` that already
    carry a non-None ``serverId`` are skipped without touching the
    database. For the others, the page at ``getURL(user, 0)`` is fetched
    and the first child link of the matching ``div`` whose ``href``
    contains ``id=`` supplies the id (the text following ``id=``).

    ``db.updateServerId`` is called for every user that was not skipped;
    the id is None when no such link was found or scraping failed.
    Exceptions raised while scraping are caught and handed to
    ``logException``.

    :param userId: forwarded unchanged to ``convertUserIdToUserList``.
    """
    for user in self.convertUserIdToUserList(userId):
        if 'serverId' in user and user['serverId'] is not None:
            continue

        serverId = None
        try:
            page = self.br.open(self.getURL(user, 0))
            soup = BeautifulSoup(page.read())
            container = soup.find(
                'div',
                attrs={'align': 'right', 'class': 'smallText',
                       'style': 'padding-top:5px'})
            if container is not None:
                for tag in container.contents:
                    if not isinstance(tag, element.Tag):
                        continue
                    if 'href' not in tag.attrs:
                        continue
                    href = tag.attrs['href']
                    # Require the exact 'id=' marker that the split relies
                    # on; a link that merely contains 'id' somewhere would
                    # otherwise raise IndexError and abort the search.
                    if 'id=' in href:
                        serverId = href.split('id=')[1]
                        break
        except Exception as e:
            logException(user['id'], self.getServerId.__name__, e)
        finally:
            self.db.updateServerId(user['id'], serverId)