Exemple #1
0
def main():
    """Clean up the article table: drop empty articles and HTTP duplicates.

    Reads ``config/GreynirSimple.conf`` below ``basepath`` and then, inside
    one ``SessionContext(commit=True)``:

    1. deletes every article whose ``num_sentences`` is 0;
    2. for each article whose URL starts with ``https://``, deletes any
       article stored under the same URL with the ``http://`` scheme.

    The number of rows removed in each step is printed to standard output.

    Raises:
        SystemExit: with status 1 when the configuration cannot be read.
    """
    try:
        # Read configuration file
        Settings.read(os.path.join(basepath, "config", "GreynirSimple.conf"))
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        # quit() is an interactive helper injected by the site module (it is
        # missing under `python -S`) and would report success (status 0).
        raise SystemExit(1)

    with SessionContext(commit=True) as session:

        # Zero sentences
        print("Deleting all articles with zero sentences")
        res = session.execute(
            ArticleModel.table().delete().where(ArticleModel.num_sentences == 0)
        )
        print(str(res.rowcount) + " articles deleted")

        # Non-Icelandic
        # TODO: Implement me!

        # Duplicates
        # For each https article, check whether there is a corresponding
        # article URL with http URI scheme
        dupl = 0
        q = session.query(ArticleModel.url).filter(ArticleModel.url.like("https://%"))
        # Materialize the result first so the deletes below do not run while
        # the SELECT is still being iterated.
        for r in q.all():
            url = re.sub(r"^https://", r"http://", r.url)
            res = session.execute(
                ArticleModel.table().delete().where(ArticleModel.url == url)
            )
            dupl += res.rowcount
        print("{0} duplicate URLs w. HTTP scheme removed".format(dupl))
Exemple #2
0
def create_article(title, content, owner_login, user_suggestion):
    """Store a new article, either published directly or as a suggested draft.

    Args:
        title: Raw title; it is HTML-escaped before being stored.
        content: Markdown source of the article body.
        owner_login: Name of the user submitting the article.
        user_suggestion: Name of the user the article is suggested to, or
            ``None`` to publish it under ``owner_login``. A suggested article
            is stored as a draft owned by the suggested user.

    Returns:
        ``False`` when the title or content exceeds its length limit (nothing
        is stored), ``True`` once the article has been created.

    Raises:
        Whatever ``User.get`` raises when ``owner_login`` or
        ``user_suggestion`` does not match a user.
    """
    if (len(content) > MAX_ARTICLE_CONTENT_LENGTH
            or len(title) > MAX_TITLE_LENGTH):
        return False

    # The submitter is looked up even for suggestions, so an unknown
    # owner_login still fails exactly as before.
    user = User.get(User.name == owner_login)
    is_suggestion = user_suggestion is not None
    owner = User.get(User.name == user_suggestion) if is_suggestion else user

    html_content = markdown.markdown(content,
                                     safe_mode='escape',
                                     extensions=[TocExtension(baselevel=3)])
    stripped_text = strip_html_tags(html_content)
    preview_text = stripped_text[:MAX_ARTICLE_PREVIEW_TEXT_LENGTH]
    if len(stripped_text) > MAX_ARTICLE_PREVIEW_TEXT_LENGTH:
        preview_text += '...'

    # create() already inserts the row; a following save() would only issue
    # a redundant UPDATE.
    Article.create(title=html.escape(title),
                   content=html_content,
                   preview_text=preview_text,
                   owner=owner,
                   is_draft=is_suggestion)

    if not is_suggestion:
        # Increment inside the database instead of writing back a value read
        # earlier, so concurrent submissions cannot lose an update.
        User\
            .update({User.articles_count: User.articles_count + 1})\
            .where(User.id == owner.id)\
            .execute()
    return True
Exemple #3
0
    def get_all_articles(self, query: models.Article, tag: str) -> Any:
        """Build the article listing query, most recently modified first.

        Args:
            query: Query object to build on; it must provide ``order_by`` and
                ``filter``.
            tag: When truthy, only articles whose ``tags`` (cast to a string)
                contain this text, compared case-insensitively, are kept.
                NOTE: ``%`` and ``_`` inside ``tag`` are not escaped and act
                as LIKE wildcards.

        Returns:
            The ordered (and optionally filtered) query, or ``None`` when
            SQLAlchemy raised an error; the error is logged.
        """
        try:
            data = query.order_by(models.Article.modified_timestamp.desc())

            if tag:
                looking_for = '%{0}%'.format(tag)
                # Filter the already ordered query; restarting from `query`
                # here would silently drop the ordering.
                data = data.filter(
                    cast(models.Article.tags, String).ilike(looking_for))

            return data
        except SQLAlchemyError:
            fastapi_logger.exception("get_all_articles")
            return None
Exemple #4
0
    def save(self):
        """Persist the current form inputs as an article and return them.

        When ``self.data`` is truthy, its attributes are overwritten with
        the inputs, it is added to ``self.session`` and committed, and its
        ``id`` is stored in the returned dict under ``'id'``. Otherwise a new
        ``Article`` is built from the inputs and handed to the module-level
        ``save`` helper; no ``'id'`` key is added in that case.

        Returns:
            The dict obtained from ``self.get_inputs_values()``.
        """
        values = self.get_inputs_values()

        # (article attribute, input key) pairs. The input keys are not always
        # the attribute names: 'familly' and 'qte_stock' are what the form uses.
        field_map = (
            ('code', 'code'),
            ('designation', 'designation'),
            ('family', 'familly'),
            ('author', 'author'),
            ('editor', 'editor'),
            ('buying_price', 'buying_price'),
            ('selling_price', 'selling_price'),
            ('quantity', 'qte_stock'),
        )

        if not self.data:
            # No record loaded yet: create a new one.
            save(Article(**{attr: values.get(key) for attr, key in field_map}))
            return values

        # Update the loaded record in place.
        for attr, key in field_map:
            setattr(self.data, attr, values.get(key))
        self.session.add(self.data)
        self.session.commit()
        values['id'] = self.data.id
        return values
Exemple #5
0
def get_article_by_id(art_id, username):
    """Return the article with id ``art_id`` if ``username`` may see it.

    A draft is only returned to its owner. ``None`` is returned when
    ``art_id`` is not convertible to ``int``, when a lookup raises
    ``DoesNotExist`` (the user is looked up for every article, draft or
    not), or when the article is a draft owned by someone else.
    """
    try:
        article = Article.get_by_id(int(art_id))
        viewer = User.get(User.name == username)
        viewer_is_owner = viewer.id == article.owner.id
    except (ValueError, DoesNotExist):
        return None

    if article.is_draft and not viewer_is_owner:
        return None
    return article
Exemple #6
0
 def get_article(self, query: models.Article, article_id: str):
     """Fetch the first article whose ``article_id`` equals the given id.

     Returns the result of ``.first()`` on the filtered query, or ``None``
     when SQLAlchemy raised an error; the error is logged.
     """
     try:
         criterion = models.Article.article_id == article_id
         return query.filter(criterion).first()
     except SQLAlchemyError:
         fastapi_logger.exception("get_article")
         return None
Exemple #7
0
def publish_article(username, article_id):
    """Publish a draft article on behalf of its owner.

    Args:
        username: Name of the user requesting publication.
        article_id: Id of the article to publish.

    Returns:
        ``False`` when ``username`` is not the article owner's name,
        ``True`` otherwise. Publishing an article that is already published
        is a no-op that still returns ``True``.

    Raises:
        Whatever ``Article.get_by_id`` raises for an unknown ``article_id``.
    """
    article = Article.get_by_id(article_id)
    owner = article.owner
    if owner.name != username:
        return False

    # Only rows that are still drafts are flipped; the affected-row count
    # tells us whether this call actually published something.
    published = Article\
        .update({Article.is_draft: False})\
        .where((Article.id == article_id)
               & (Article.is_draft == True))\
        .execute()  # noqa: E712 -- `== True` builds the SQL comparison

    if published:
        # Count each article once, and increment in SQL so that concurrent
        # publications cannot overwrite each other's update.
        User\
            .update({User.articles_count: User.articles_count + 1})\
            .where(User.id == owner.id)\
            .execute()
    return True
Exemple #8
0
def new_article():
    """Return the id of the article for the posted link, creating it if needed.

    Reads ``link`` from the submitted form. When no article with that link
    exists, one is created from the form's ``content``, ``title``, ``image``
    and ``author`` fields, committed, and looked up again by link.

    Returns:
        A JSON response of the form ``{"article_id": <id>}``.
    """
    form = flask.request.form
    link = form['link']

    def find_by_link():
        return Article.query.filter_by(link=link).first()

    existing = find_by_link()
    if existing is not None:
        return flask.jsonify(article_id=existing.id)

    # Unknown link: store a new article, then read it back for its id.
    db.session.add(Article(link=link,
                           content=form['content'],
                           title=form['title'],
                           image=form['image'],
                           author=form['author']))
    db.session.commit()
    return flask.jsonify(article_id=find_by_link().id)
Exemple #9
0
def toutiao_spider(category_web, category_db):
    """Crawl the Toutiao PC feed for one category and store its articles.

    Starting from the current time, the feed is requested up to 100 times,
    moving ``max_behot_time`` back by 20 minutes per request. Crawling stops
    once at least 200 non-empty feed items have been seen. Only items that
    carry an image URL are stored; each one is committed individually and a
    failed commit is rolled back so the crawl can continue.

    Args:
        category_web: Category name sent to the feed API.
        category_db: Category value stored on each saved article.
    """
    start_url = 'http://www.toutiao.com/api/pc/feed/'
    time_stamp = time.time()
    param = {
        "category": category_web,
        "utm_source": "toutiao",
        "widen": "1",
        "max_behot_time": str(time_stamp),
        "max_behot_time_tmp": str(time_stamp),
        "tadrequire": "true",
        "as": "A1053924239A06B",
        "cp": "5943CAE086FBEE1"
    }
    count = 0
    for i in range(100):
        # Check the quota before fetching, so no request is wasted on a page
        # that would be thrown away.
        if count >= 200:
            break
        param['max_behot_time'] = param['max_behot_time_tmp'] = str(
            time_stamp - i * 20 * 60)
        response = get_page(url=start_url, params=param)
        json_obj = json.loads(response)
        # 'data' may be missing or null in the response.
        for one in json_obj.get('data') or []:
            if not one:
                continue
            count += 1
            image_url = one.get('image_url', '')
            behot_time = one.get('behot_time')
            if not image_url or behot_time is None:
                # Items without an image are never stored, and
                # time.localtime() needs a numeric timestamp.
                continue
            publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(behot_time))
            article = Article(
                url='http://www.toutiao.com' + one.get('source_url', ''),
                title=one.get('title', ''),
                publish_time=publish_time,
                abstract=one.get('abstract', ''),
                image=image_url,
                category=category_db,
                web_site='今日头条')
            session.add(article)
            try:
                session.commit()
            except Exception:
                # Best-effort insert: undo the failed transaction so the
                # session stays usable, without swallowing KeyboardInterrupt
                # or SystemExit as a bare `except:` would.
                session.rollback()
Exemple #10
0
    def on_pushButtonImport_clicked(self):
        """Import articles from a spreadsheet chosen by the user.

        Opens a file dialog, reads the first sheet of the selected workbook
        with ``xlrd`` and creates one ``Article`` per row, starting at the
        third row (rows 0 and 1 are skipped -- presumably headers, confirm
        against the expected file layout). Columns 0-5 are read as code,
        designation, family, author, editor and selling price. The new
        articles are committed, ``self.articles`` is reloaded, and the table
        model and view are refreshed.

        Does nothing when the dialog is cancelled.
        """
        file_name = QFileDialog.getOpenFileName(self, 'Ouvrir le fichier', '',
                                                'Text files (*.xlsx)')[0]
        if not file_name:
            # Dialog cancelled: the returned file name is empty and
            # open_workbook('') would raise.
            return

        wb = xlrd.open_workbook(file_name)
        sheet = wb.sheet_by_index(0)

        article_list = []
        for i in range(2, sheet.nrows):
            (code, designation, family,
             author, editor, selling_price) = (sheet.cell_value(i, col)
                                               for col in range(6))
            article = Article(
                code=code,
                designation=designation,
                family=family,
                author=author,
                editor=editor,
                selling_price=selling_price,
            )
            # The table model expects the 'familly' spelling for this key.
            article_list.append({
                'code': code,
                'designation': designation,
                'familly': family,
                'author': author,
                'editor': editor,
                'selling_price': selling_price
            })
            self.session.add(article)

        self.session.commit()

        self.articles = self.session.query(Article).all()
        self.article_table_model.add_articles(article_list)
        self.emit_tableView_layout_change_event()
Exemple #11
0
    def store(self, enclosing_session=None):
        """ Insert this article into the database, or update its row.

        Without a UUID the article is treated as new: a fresh UUID is
        assigned, any existing rows with the same URL are deleted and a new
        row is added. With a UUID, the matching row is looked up and its
        columns are overwritten. In both cases the word stems are stored and
        the session is flushed.

        Returns True on success, or False when the article has a UUID but
        no row with that id exists. """
        # Columns copied one-to-one between this object (stored as
        # ``_<name>``) and the ArticleRow; the id column is handled separately.
        column_names = (
            "url",
            "root_id",
            "heading",
            "author",
            "timestamp",
            "authority",
            "scraped",
            "parsed",
            "processed",
            "indexed",
            "scr_module",
            "scr_class",
            "scr_version",
            "parser_version",
            "num_sentences",
            "num_parsed",
            "ambiguity",
            "html",
            "tree",
            "tokens",
        )

        with SessionContext(enclosing_session, commit=True) as session:
            if self._uuid is not None:
                # Update an already existing row by UUID
                row = (
                    session.query(ArticleRow)
                    .filter(ArticleRow.id == self._uuid)
                    .one_or_none()
                )
                if row is None:
                    # UUID not found: something is wrong here...
                    return False
                # The UUID is immutable; every other column is refreshed
                for name in column_names:
                    setattr(row, name, getattr(self, "_" + name))
            else:
                # Insert a new row under a fresh UUID
                self._uuid = str(uuid.uuid1())
                row = ArticleRow(
                    id=self._uuid,
                    **{name: getattr(self, "_" + name) for name in column_names}
                )
                # Delete any existing rows with the same URL
                session.execute(
                    ArticleRow.table().delete().where(ArticleRow.url == self._url)
                )
                session.add(row)

            # Store the word stems occurring in the article
            # (This may cause all stems for the article to be deleted, if
            # there are no successfully parsed sentences in the article)
            self._store_words(session)
            # Offload the new data from Python to PostgreSQL
            session.flush()
            return True