def main():
    try:
        # Read configuration file
        Settings.read(os.path.join(basepath, "config", "GreynirSimple.conf"))
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        quit()

    with SessionContext(commit=True) as session:

        # Zero sentences
        print("Deleting all articles with zero sentences")
        res = session.execute(
            ArticleModel.table().delete().where(ArticleModel.num_sentences == 0)
        )
        print("{0} articles deleted".format(res.rowcount))

        # Non-Icelandic
        # TODO: Implement me!

        # Duplicates
        # For each https article, check whether there is a corresponding
        # article URL with http URI scheme
        dupl = 0
        q = session.query(ArticleModel.url).filter(ArticleModel.url.like("https://%"))
        for r in q.all():
            url = re.sub(r"^https://", r"http://", r.url)
            res = session.execute(
                ArticleModel.table().delete().where(ArticleModel.url == url)
            )
            dupl += res.rowcount
        print("{0} duplicate URLs w. HTTP scheme removed".format(dupl))
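# Entry-point sketch: the cleanup routine above is written to run as a
# standalone maintenance script (assumes Settings, SessionContext and
# ArticleModel are imported from this project's database modules):

if __name__ == "__main__":
    main()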
def create_article(title, content, owner_login, user_suggestion):
    if len(content) > MAX_ARTICLE_CONTENT_LENGTH or len(title) > MAX_TITLE_LENGTH:
        return False
    user = User.get(User.name == owner_login)
    if user_suggestion is None:
        owner = user
    else:
        owner = User.get(User.name == user_suggestion)
    # NB: safe_mode was deprecated in Python-Markdown 2.x and removed in 3.0;
    # with a modern version, sanitize the output separately (e.g. with bleach)
    html_content = markdown.markdown(content, safe_mode='escape',
                                     extensions=[TocExtension(baselevel=3)])
    stripped_text = strip_html_tags(html_content)
    preview_text = stripped_text[:MAX_ARTICLE_PREVIEW_TEXT_LENGTH]
    if len(stripped_text) > MAX_ARTICLE_PREVIEW_TEXT_LENGTH:
        preview_text += '...'
    # Article.create() already persists the row; a trailing .save() would
    # issue a redundant second query
    Article.create(title=html.escape(title),
                   content=html_content,
                   preview_text=preview_text,
                   owner=owner,
                   is_draft=user_suggestion is not None)
    if user_suggestion is None:
        # Increment atomically in SQL to avoid a read-modify-write race
        (User
         .update({User.articles_count: User.articles_count + 1})
         .where(User.id == user.id)
         .execute())
    return True
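# Usage sketch for create_article (the 'alice'/'bob' accounts and texts are
# made-up illustrations; assumes the Peewee User/Article models above):

# Regular post: 'alice' owns the article and it is published immediately
create_article("Hello", "Some *markdown* body",
               owner_login="alice", user_suggestion=None)

# Suggestion: 'bob' becomes the owner and the article starts as a draft
create_article("A tip", "Suggested content",
               owner_login="alice", user_suggestion="bob")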
def get_all_articles(self, query: Query, tag: str) -> Any:
    """ Get All Articles """
    try:
        data = query.order_by(models.Article.modified_timestamp.desc())
        if tag:
            looking_for = '%{0}%'.format(tag)
            # Filter the ordered query, not the original one, so the
            # ordering is preserved
            data = data.filter(
                cast(models.Article.tags, String).ilike(looking_for))
        return data
    except SQLAlchemyError:
        fastapi_logger.exception("get_all_articles")
        return None
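# Usage sketch for get_all_articles (illustrative: 'dao' is an instance of the
# enclosing DAO class and 'session' a SQLAlchemy Session; both are assumptions):

base_query = session.query(models.Article)
tagged = dao.get_all_articles(base_query, tag="python")   # filtered by tag
everything = dao.get_all_articles(base_query, tag="")     # no tag filter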
def save(self):
    inputs_values = self.get_inputs_values()
    if self.data:
        # Update the existing record in place
        self.data.code = inputs_values.get('code')
        self.data.designation = inputs_values.get('designation')
        # NB: 'familly' (sic) is the key used by the input form
        self.data.family = inputs_values.get('familly')
        self.data.author = inputs_values.get('author')
        self.data.editor = inputs_values.get('editor')
        self.data.buying_price = inputs_values.get('buying_price')
        self.data.selling_price = inputs_values.get('selling_price')
        self.data.quantity = inputs_values.get('qte_stock')
        self.session.add(self.data)
        self.session.commit()
        inputs_values['id'] = self.data.id
    else:
        # Create a new record via the module's save() helper
        save(
            Article(
                code=inputs_values.get('code'),
                designation=inputs_values.get('designation'),
                family=inputs_values.get('familly'),
                author=inputs_values.get('author'),
                editor=inputs_values.get('editor'),
                buying_price=inputs_values.get('buying_price'),
                selling_price=inputs_values.get('selling_price'),
                quantity=inputs_values.get('qte_stock'),
            ))
    return inputs_values
def get_article_by_id(art_id, username):
    try:
        article = Article.get_by_id(int(art_id))
        # Drafts are only visible to their owner
        if User.get(User.name == username).id != article.owner.id and article.is_draft:
            return None
        return article
    except (ValueError, DoesNotExist):
        return None
def get_article(self, query: Query, article_id: str):
    """ Get A Single article """
    try:
        data = query.filter(
            models.Article.article_id == article_id).first()
        return data
    except SQLAlchemyError:
        fastapi_logger.exception("get_article")
        return None
def publish_article(username, article_id):
    article = Article.get_by_id(article_id)
    user = article.owner
    # Only the owner may publish their own draft
    if user.name != username:
        return False
    (Article
     .update({Article.is_draft: False})
     .where(Article.id == article_id)
     .execute())
    # Increment atomically in SQL to avoid a read-modify-write race
    (User
     .update({User.articles_count: User.articles_count + 1})
     .where(User.id == user.id)
     .execute())
    return True
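# Usage sketch for publish_article (illustrative id/username; assumes the
# Peewee models above):

if publish_article("alice", article_id=42):
    print("Article 42 is now public")
else:
    print("Refused: 'alice' does not own article 42")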
def new_article():
    post_data = flask.request.form
    article_url = post_data['link']
    article = Article.query.filter_by(link=article_url).first()
    if article is None:
        # Article doesn't exist yet: create it in the database
        article = Article(link=article_url,
                          content=post_data['content'],
                          title=post_data['title'],
                          image=post_data['image'],
                          author=post_data['author'])
        db.session.add(article)
        db.session.commit()
        # After the commit, article.id is populated; no need to re-query
    return flask.jsonify(article_id=article.id)
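# Usage sketch: exercising new_article with Flask's test client. The route
# path and the 'app' registration are assumptions, not taken from the source:

with app.test_client() as client:
    resp = client.post("/articles/new", data={
        "link": "https://example.com/post",
        "content": "Body text",
        "title": "A title",
        "image": "https://example.com/img.png",
        "author": "alice",
    })
    print(resp.get_json())  # e.g. {"article_id": 1}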
def toutiao_spider(category_web, category_db):
    start_url = 'http://www.toutiao.com/api/pc/feed/'
    time_stamp = time.time()
    param = {
        "category": category_web,
        "utm_source": "toutiao",
        "widen": "1",
        "max_behot_time": str(time_stamp),
        "max_behot_time_tmp": str(time_stamp),
        "tadrequire": "true",
        "as": "A1053924239A06B",
        "cp": "5943CAE086FBEE1"
    }
    count = 0
    for i in range(0, 100):
        # Step the feed cursor back 20 minutes per iteration
        param['max_behot_time'] = param['max_behot_time_tmp'] = str(
            time_stamp - i * 20 * 60)
        response = get_page(url=start_url, params=param)
        json_obj = json.loads(response)
        if count >= 200:
            break
        # 'data' may be missing or null in the response
        for one in json_obj.get('data') or []:
            if not one:
                continue
            count += 1
            source_url = 'http://www.toutiao.com' + one.get('source_url', '')
            title = one.get('title', '')
            behot_time = one.get('behot_time', '')
            x = time.localtime(behot_time)
            behot_time = time.strftime('%Y-%m-%d %H:%M:%S', x)
            abstract = one.get('abstract', '')
            image_url = one.get('image_url', '')
            category = category_db
            web_site = '今日头条'  # Jinri Toutiao, the site being scraped
            article = Article(url=source_url,
                              title=title,
                              publish_time=behot_time,
                              abstract=abstract,
                              image=image_url,
                              category=category,
                              web_site=web_site)
            # Only store entries that have an image
            if image_url:
                session.add(article)
                try:
                    session.commit()
                except Exception:
                    session.rollback()
def on_pushButtonImport_clicked(self):
    file_name = QFileDialog.getOpenFileName(self, 'Open file', '',
                                            'Excel files (*.xlsx)')[0]
    if not file_name:
        # User cancelled the dialog
        return
    # NB: xlrd 2.0 dropped .xlsx support; this requires xlrd < 2.0
    # (or a port to openpyxl)
    wb = xlrd.open_workbook(file_name)
    sheet = wb.sheet_by_index(0)
    article_list = []
    # Data rows start at index 2, below the header rows
    for i in range(2, sheet.nrows):
        article = Article(
            code=sheet.cell_value(i, 0),
            designation=sheet.cell_value(i, 1),
            family=sheet.cell_value(i, 2),
            author=sheet.cell_value(i, 3),
            editor=sheet.cell_value(i, 4),
            selling_price=sheet.cell_value(i, 5),
        )
        article_list.append({
            'code': sheet.cell_value(i, 0),
            'designation': sheet.cell_value(i, 1),
            'familly': sheet.cell_value(i, 2),
            'author': sheet.cell_value(i, 3),
            'editor': sheet.cell_value(i, 4),
            'selling_price': sheet.cell_value(i, 5)
        })
        self.session.add(article)
    # Commit once after the loop rather than per row
    self.session.commit()
    self.articles = self.session.query(Article).all()
    self.article_table_model.add_articles(article_list)
    self.emit_tableView_layout_change_event()
def store(self, enclosing_session=None):
    """ Store an article in the database, inserting it or updating """
    with SessionContext(enclosing_session, commit=True) as session:
        if self._uuid is None:
            # Insert a new row
            self._uuid = str(uuid.uuid1())
            ar = ArticleRow(
                id=self._uuid,
                url=self._url,
                root_id=self._root_id,
                heading=self._heading,
                author=self._author,
                timestamp=self._timestamp,
                authority=self._authority,
                scraped=self._scraped,
                parsed=self._parsed,
                processed=self._processed,
                indexed=self._indexed,
                scr_module=self._scr_module,
                scr_class=self._scr_class,
                scr_version=self._scr_version,
                parser_version=self._parser_version,
                num_sentences=self._num_sentences,
                num_parsed=self._num_parsed,
                ambiguity=self._ambiguity,
                html=self._html,
                tree=self._tree,
                tokens=self._tokens,
            )
            # Delete any existing rows with the same URL
            session.execute(ArticleRow.table().delete().where(
                ArticleRow.url == self._url))
            # Add the new row with a fresh UUID
            session.add(ar)
            # Store the word stems occurring in the article
            self._store_words(session)
            # Offload the new data from Python to PostgreSQL
            session.flush()
            return True
        # Update an already existing row by UUID
        ar = (session.query(ArticleRow).filter(
            ArticleRow.id == self._uuid).one_or_none())
        if ar is None:
            # UUID not found: something is wrong here...
            return False
        # Update the columns
        # UUID is immutable
        ar.url = self._url
        ar.root_id = self._root_id
        ar.heading = self._heading
        ar.author = self._author
        ar.timestamp = self._timestamp
        ar.authority = self._authority
        ar.scraped = self._scraped
        ar.parsed = self._parsed
        ar.processed = self._processed
        ar.indexed = self._indexed
        ar.scr_module = self._scr_module
        ar.scr_class = self._scr_class
        ar.scr_version = self._scr_version
        ar.parser_version = self._parser_version
        ar.num_sentences = self._num_sentences
        ar.num_parsed = self._num_parsed
        ar.ambiguity = self._ambiguity
        ar.html = self._html
        ar.tree = self._tree
        ar.tokens = self._tokens
        # If the article has been parsed, update the index of word stems
        # (This may cause all stems for the article to be deleted, if
        # there are no successfully parsed sentences in the article)
        self._store_words(session)
        # Offload the new data from Python to PostgreSQL
        session.flush()
        return True
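# Usage sketch for Article.store() ('article' is an illustrative instance,
# assumed to have been populated by the project's scraper):

# Standalone call: opens its own session and commits on success
article.store()

# Reusing an enclosing session, so several stores share one transaction
with SessionContext(commit=True) as session:
    article.store(enclosing_session=session)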