Ejemplo n.º 1
0
def prepare_es_dto(obj):
    """Normalize an article dict in place and return it.

    The following keys are written:

    * ``main`` is always reset to ``False``.
    * ``fulltext`` defaults to ``''`` when absent.
    * ``date_iso`` is converted to an ISO-8601 string when it holds a
      ``datetime``; when it is absent it is derived from ``date`` if that
      holds a ``datetime``. If neither is usable, ``date_iso`` is left
      untouched (or unset).
    * ``seen`` and ``similar`` are replaced by an empty list when they are
      missing, ``None`` or a string; any other value is kept as is.
    * ``fulltext``, ``content`` and ``title`` are passed through ``clean``.

    Args:
        obj: Mutable mapping describing the article. Must contain
            ``content`` and ``title``.

    Returns:
        The same ``obj`` instance, after modification.

    Raises:
        KeyError: If ``content`` or ``title`` is missing.
    """
    obj['main'] = False

    if 'fulltext' not in obj:
        obj['fulltext'] = ''

    if 'date_iso' in obj:
        if isinstance(obj['date_iso'], datetime):
            obj['date_iso'] = obj['date_iso'].isoformat()
    # .get() so a document that carries neither 'date_iso' nor 'date' does
    # not blow up with KeyError; it simply ends up without 'date_iso'.
    elif isinstance(obj.get('date'), datetime):
        obj['date_iso'] = obj['date'].isoformat()

    # Both fields share the same rule: missing, None or a string means
    # "no usable list", so start from an empty one.
    for key in ('seen', 'similar'):
        value = obj.get(key)
        if value is None or isinstance(value, str):
            obj[key] = []

    for key in ('fulltext', 'content', 'title'):
        obj[key] = clean(obj[key])

    return obj
Ejemplo n.º 2
0
def get_instance(cls, dictArticle, source):
    """Persist a feed entry as a new ``cls`` record and describe it.

    The entry is looked up (or created) by its ``link``. Only when a new
    record is created are its fields filled in and saved.

    Args:
        cls: Model class exposing ``objects.get_or_create``; its
            ``__name__`` is stored as the article's ``tag``.
        dictArticle: Feed entry providing ``description``, ``title``,
            ``link``, ``links`` and ``published_parsed`` attributes.
        source: Name of the feed source; joined into the hash input with
            ``':'``, so it is expected to be a string.

    Returns:
        A dict with ``title``, ``link``, ``hash_key``, ``content``,
        ``source``, ``tag``, ``image_url`` and ``date`` when a new record was
        created. ``None`` when the entry has no description, when a record
        with the same link already exists, or when any error occurred (the
        error is reported through ``utils.print_exception``).
    """
    try:
        if not dictArticle.description:
            return None

        # Strip the markup: only the text content is hashed and stored.
        content = lxml.html.fromstring(dictArticle.description).text_content()
        hash_input = ':'.join([dictArticle.title, content, source])\
                        .encode('ascii', 'ignore')
        hash_key = md5_constructor(hash_input).hexdigest()

        published = dictArticle.published_parsed
        if published:
            article_date = datetime.fromtimestamp(mktime(published)).isoformat()
        else:
            # No publication date on the entry: fall back to "now".
            article_date = datetime.now().isoformat()

        # NOTE(review): the row is created here, before the remaining fields
        # are computed; if anything below raises, a link-only row is left
        # behind and later calls will see created == False. Confirm whether
        # that is acceptable.
        record, created = cls.objects.get_or_create(link=dictArticle.link)
        if not created:
            return None

        article = {
            'title': utils.clean(dictArticle.title),
            'link': dictArticle.link,
            'hash_key': hash_key,
            'content': utils.clean(content),
            'source': source,
            'tag': cls.__name__,
            'image_url': get_image_url(dictArticle.links),
            'date': article_date,
        }
        record.title = article['title']
        record.hash_key = article['hash_key']
        record.content = article['content']
        record.source = article['source']
        record.date = article['date']
        record.image_url = article['image_url']
        record.save()
        return article

    # Best-effort import: report ordinary failures and carry on, but let
    # KeyboardInterrupt / SystemExit propagate instead of swallowing them.
    except Exception:
        utils.print_exception()

    return None