def db_objects_to_articles(db_objects):
    """Convert rows from database.py -> retrieve_articles() into ArticleInfo objects.

    :param db_objects: list of dicts, one per stored article row
    :return: list of ArticleInfo instances
    """
    articles = []
    for row in db_objects:
        # Comments are stored as a JSON-encoded list inside the row.
        parsed_comments = []
        for raw in json.loads(row["Comments"]):
            c = Comment()
            c.author = raw["Author"]
            c.city = raw["City"]
            c.text = raw["Text"]
            c.likes = raw["Likes"]
            c.dislikes = raw["Dislikes"]
            c.ratio = float(raw["Ratio"])
            c.time = raw["Time"]
            parsed_comments.append(c)
        articles.append(ArticleInfo(
            row["Link"], row["Header"], row["Description"], row["Category"],
            row["Author"], row["Published_at"], row["Modified_at"],
            row["Paragraphs"], parsed_comments))
    return articles
def grab_best_comments():
    """Return the best comments across the newest articles as a JSON response.

    Comments are ranked by (ratio, likes) descending. The optional ``len``
    query parameter overrides the default result size of 5.
    """
    requested = request.args.get("len")
    comment_count = int(requested) if requested is not None else 5
    articles = retrieve_newest_articles()
    # Collect every comment, then rank them best-first.
    ranked = sorted(Comment.find_comments(articles),
                    key=lambda c: (c.ratio, c.likes),
                    reverse=True)
    return jsonify(Comment.as_dicts(ranked[:comment_count]))
def parse_comments(response):
    """Extract all comments from an article's comments page and attach them.

    :param response: Scrapy response; ``response.meta['article_object']``
        carries the article the comments belong to
    """
    article = response.meta.get("article_object")
    # XPath (rather than CSS) is needed here for fine-grained access to the
    # comment markup around the like/dislike buttons.
    xp = response.xpath
    authors = xp("//a[@data-dot='souhlasim']/../../div/div/div/text()").getall()
    bodies = xp("//a[@data-dot='souhlasim']/../../../../div/div/text()").getall()
    likes = xp("//a[@data-dot='souhlasim']/span/text()").getall()
    dislikes = xp("//a[@data-dot='nesouhlasim']/span/text()").getall()
    times = xp("//a[@data-dot='souhlasim']/../../div/div/div/span/text()").getall()
    parsed = []
    # One like-counter per comment, so its length drives the iteration.
    for i, like_count in enumerate(likes):
        parsed.append(Comment(authors[i], bodies[i], like_count,
                              dislikes[i], times[i]))
    article.comments = parsed
def as_dict(self):
    """Serialize this article into a plain dict; comments are JSON-encoded."""
    serialized_comments = json.dumps(Comment.as_dicts(self.comments),
                                     ensure_ascii=False)
    payload = {
        'Link': self.link,
        'Header': self.header,
        'Description': self.description,
        'Category': self.category,
        'Author': self.author,
        'Published_at': self.published_at,
        'Modified_at': self.modified_at,
        'Paragraphs': self.paragraphs,
        'Paragraphs_count': len(self.paragraphs),
        'Comments': serialized_comments,
    }
    return payload
threads = []
corpus = Corpus("")
# Build one Thread per source thread; each thread gets its Question with
# all its numbered comments, and every comment body feeds the corpus.
for thread in threadsOrigin:
    t = Thread(thread['THREAD_SEQUENCE'])
    q = Question(thread.RelQuestion.RelQSubject.string,
                 thread.RelQuestion.RelQBody.string)
    # Comment positions are 1-based within their thread.
    for position, comment in enumerate(thread.find_all('RelComment'), start=1):
        body = comment.RelCText.string
        q.add_comment(Comment(comment['RELC_ID'],
                              comment['RELC_USERNAME'],
                              comment['RELC_RELEVANCE2RELQ'],
                              body,
                              position))
        corpus.addCorpusText(body)
    obo.setQuestionDictionaries(q)
    t.set_question(q)
    threads.append(t)
obo.setCorpusDictionary(corpus)

# Second pass: score each question against the finished corpus.
for t in threads:
    q = t.question
    freq.percentageQuestion(corpus, q)
    filter.commentRang(q)
    # print("question: " + str(t.id) + ", percentage: " + str(q.percentage))
def parse(self, soup, indent=0):
    """Recursively parse comment nodes at the given nesting level.

    :param soup: BeautifulSoup node containing ``div.comment`` children
    :param indent: nesting depth matched against the ``data-indent`` attribute
    :return: list of populated Comment objects (with nested subcomments)
    """
    parsed = []
    for node in soup.findAll("div", {"class": "comment", "data-indent": indent}):
        find = node.findAll
        comment = Comment()
        comment.set_author_name(
            find("div", {"class": "comment__user"})[0]['data-name'])
        comment.set_author_url(
            self.main_url + find("a", {"class": "user"})[0]['href'])
        comment.set_datetime(
            find("time", {"class": "comment__datetime"})[0]['datetime'])
        comment.set_link(
            find("a", {"class": "comment__tool", "data-role": "link"})[0]['href'])
        rating_node = find("div", {"class": "comment__rating-count"})[0]
        comment.set_rating(rating_node.text)
        comment.set_formatted_text(
            self._clean_text(find("div", {"class": "comment__content"})[0].text))
        # The aria-label of the rating node embeds the counts as digits;
        # the first number is the plus count.
        counts = [int(tok) for tok in rating_node['aria-label'].split()
                  if tok.isdigit()]
        comment.set_pluses(counts[0])
        # comment.set_minuses(counts[1])  # minus count intentionally left disabled
        if self._has_subcomments(node):
            children = self.parse(
                find("div", {"class": "comment__children"})[0], indent + 1)
        else:
            children = []
        comment.set_subcomments(children)
        comment.set_comment_level(indent)
        parsed.append(comment)
    return parsed
def test_stops_parsing(self):
    """Parsing stops at the comment terminator; trailing text is returned."""
    remainder = Comment(
        "<!-- A comment which should end here--> And more text").parse_to_end({})
    self.assertEqual(" And more text", remainder)
def test_triple_hyphen(self):
    """A '---' sequence inside a comment is malformed XML and must raise."""
    # Plain string literal: the original f-string had no placeholders (F541).
    comment = Comment("<!--A comment with ---> Text <end/>")
    with self.assertRaises(XMLError):
        comment.parse_to_end({})
def test_forbidden_characters(self):
    """Control chars and non-characters inside a comment must raise XMLError."""
    forbidden = ("\u0001", "\u0003", "\u0010", "\ufffe", "\uffff")
    for bad in forbidden:
        with self.subTest(f"Char: {bad}"):
            comment = Comment(f"<!--A comment with {bad} --> Text <end/>")
            with self.assertRaises(XMLError):
                comment.parse_to_end({})