def read_review_data(filename):
    """Load reviews from a JSON file and return them as a list of Review objects.

    The file is expected to contain an object with a 'reviews' array; each
    entry supplies 'rating', 'date', and 'review_text' fields.
    """
    with open(filename) as json_file:
        parsed = json.load(json_file)
    reviews = []
    for entry in parsed['reviews']:
        item = Review()
        item.rating = entry['rating']
        item.date = entry['date']
        item.text = entry['review_text']
        reviews.append(item)
    return reviews
# Keep only actual review pages: drop directory indexes, the url list file,
# and any previously generated .arff output.
paths = [relpath for relpath in paths
         if 'index.html' not in relpath
         and relpath != "urls"
         and '.arff' not in relpath]

for relpath in paths:
    path = data_dir + relpath
    print(path)
    # BeautifulSoup consumes the file eagerly, so close the handle right
    # away instead of leaking it (the original passed an unclosed open()).
    with open(path) as html_file:
        soup = BeautifulSoup(html_file)
    try:
        meta = soup.find('ul', {"class": "review-meta"})
        rev = Review()
        # The numeric review id is embedded in the relative file path.
        rev.id = int(re.search(r'\d+', relpath).group(0))
        rev.artist = meta.find('h1').find('a').get_text()
        rev.album = meta.find('h2').get_text()
        str_date = meta.find('span', {"class": "pub-date"}).get_text()
        rev.date = datetime.strptime(str_date, '%B %d, %Y')
        rev.score = float(meta.find('span', {"class": "score"}).get_text())
        rev.text = soup.find('div', {"class": "editorial"}).get_text()
        db.reviews.insert(rev.__dict__)
        count += 1
        print(count)
    except Exception as e:
        # Best-effort scrape: report the failure and move on to the next
        # page rather than aborting the whole run on one malformed file.
        print(e)
        print('failed to parse ' + path)
# NOTE(review): this block duplicates the scraping pass above — running the
# file as-is inserts each review into MongoDB twice; confirm whether the
# duplicate is intentional before keeping it.
# Keep only actual review pages: drop directory indexes, the url list file,
# and any previously generated .arff output.
paths = [relpath for relpath in paths
         if 'index.html' not in relpath
         and relpath != "urls"
         and '.arff' not in relpath]

for relpath in paths:
    path = data_dir + relpath
    print(path)
    # BeautifulSoup consumes the file eagerly, so close the handle right
    # away instead of leaking it (the original passed an unclosed open()).
    with open(path) as html_file:
        soup = BeautifulSoup(html_file)
    try:
        meta = soup.find('ul', {"class": "review-meta"})
        rev = Review()
        # The numeric review id is embedded in the relative file path.
        rev.id = int(re.search(r'\d+', relpath).group(0))
        rev.artist = meta.find('h1').find('a').get_text()
        rev.album = meta.find('h2').get_text()
        str_date = meta.find('span', {"class": "pub-date"}).get_text()
        rev.date = datetime.strptime(str_date, '%B %d, %Y')
        rev.score = float(meta.find('span', {"class": "score"}).get_text())
        rev.text = soup.find('div', {"class": "editorial"}).get_text()
        db.reviews.insert(rev.__dict__)
        count += 1
        print(count)
    except Exception as e:
        # Best-effort scrape: report the failure and move on to the next
        # page rather than aborting the whole run on one malformed file.
        print(e)
        print('failed to parse ' + path)