Example #1

import logging
import time

import psycopg2

# project-local imports (downloader, timer, control_chars, Documents,
# MSources, Sources, TwitterClassifier, get_connection, get_termvector,
# str2tuple, tuple2str, MIN_SCORE, SLEEP_TIME) are omitted from this extract

def main():
    # logging init
    logger = logging.getLogger("db_stahovak")
    # NB: WARNING suppresses the logger.info()/debug() calls below
    logger.setLevel(logging.WARNING)

    # startup log message
    logger.info("START")

    # classifier
    tcl = TwitterClassifier()

    # get Twitter source ids - only Twitter documents get classified
    conn = get_connection()
    cursor = conn.cursor()

    cursor.execute("select id from sources_twitter")
    twitter_ids = [row[0] for row in cursor]

    while True:
        # feeds init
        # XXX performance problem - sources should be loaded once, before the while loop
        sources = MSources()
        sources.get_multi(where="_stahovak = true")
        feeds = [Sources(**data) for data in sources.value()]
        assert feeds
        items_count = 0
        for source in feeds:
            logger.info("SOURCE\tSECTION:%s\tLINK:%s" % (source.get_section(), source.get_link()))
            modified = str2tuple(source.get_modified())
            data = downloader.download(source.get_link(), 
                                       source.get_etag(), 
                                       modified)
            # update etag/modified
            if data['etag'] or data['modified']:
                diff = False
                if source.get_etag() != data['etag']:
                    diff = True
                    source.set_etag(data['etag'])
                if modified != data['modified']:
                    diff = True
                    source.set_modified(tuple2str(data['modified']))
                if diff:
                    source.update()

            classified_as_irrelevant = 0
            # work with items
            for item in data['items']:
                items_count += 1
                # prepare new database insert
                Item = Documents()
                Item.set_timestamp(timer.timestamp())
                Item.set_source_id(source.get_id())
                Item.set_language(source.get_language())

                Item.set_title(control_chars.remove(item['title']))
                Item.set_text(control_chars.remove(item['text']))

                try:
                    Item.set_termvector(get_termvector(
                        Item.get_text(), Item.get_language(),
                        conn))
                except psycopg2.ProgrammingError, e:
                    print str(e)
                    conn.rollback()  # reset the aborted transaction before reuse
                    continue

                Item.set__relevance(None)
                
                # only Twitter documents are classified
                if source.get_id() in twitter_ids:
                    score = tcl.classify(Item.get_text(), Item.get_language())
                    was_classified = (score != -1)
                    if was_classified and score < MIN_SCORE:
                        # below the relevance threshold - skip this item
                        classified_as_irrelevant += 1
                        continue
                    if was_classified:
                        Item.set__relevance(int(score * 100))

                Item.set_link(control_chars.remove(item['link']))
                Item.set_guid(source.get_section()+":"+control_chars.remove(item['guid']))

                if item['pubDate']:
                    pubDate = time.strftime("%Y-%m-%d", item['pubDate'])
                    if pubDate:
                        Item.set_pubDate(pubDate)
                    pubTime = time.strftime("%H:%M:%S%z", item['pubDate'])
                    if pubTime:
                        Item.set_pubTime(pubTime)
                if not Item.get_pubDate():
                    # we don't want items without a pubDate
                    continue

                # follow the item link and index the full page
                if source.get__follow():
                    url = item['link']
                    logger.debug("Following LINK:%s", url)
                    page = downloader.download_url(url)
                    Item.set_text(control_chars.remove(page.get('text', '')))
                    Item.set_html_description(control_chars.remove(page.get('description', "")))
                    Item.set_html_keywords(control_chars.remove(page.get('keywords', "")))

                # insert into the database
                if Item.get_text():
                    inserted, doc_id = Item.insert()
                    if inserted:
                        logger.debug("Document successfully inserted into db with id=%s", doc_id)
                        yield str(doc_id)  # output
                    else:
                        logger.debug("Document already in db with id=%s", doc_id)
                else:
                    logger.info("Item has no text!")

            # output per-source statistics
            logger.info("Created OUTPUT\tITEMS:%d\tIRRELEVANT:%d",
                        data['items_count'], classified_as_irrelevant)
        if not items_count:
            print "going to sleep"
            timer.sleep_second(SLEEP_TIME)
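
The example above relies on a project-local downloader.download(link, etag, modified) helper that performs a conditional HTTP fetch. A minimal sketch of what such a helper could look like on top of feedparser - the function name, return keys, and field mapping here are assumptions inferred from how the caller uses the result, not the project's actual implementation:

import feedparser

def download(link, etag, modified):
    # Conditional GET: feedparser sends If-None-Match / If-Modified-Since
    # when etag/modified are supplied, so an unchanged feed comes back
    # with no entries and the caller moves on to the next source.
    d = feedparser.parse(link, etag=etag, modified=modified)
    items = [{'title': e.get('title', ''),
              'text': e.get('summary', ''),
              'link': e.get('link', ''),
              'guid': e.get('id', e.get('link', '')),
              'pubDate': e.get('published_parsed')}  # time.struct_time or None
             for e in d.entries]
    return {'etag': d.get('etag'),
            'modified': d.get('modified_parsed'),
            'items': items,
            'items_count': len(items)}
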
Example #2

import hashlib
import logging
import time

import psycopg2

# project-local imports/constants (db_downloader, downloader, timer,
# control_chars, Documents, SOURCE_ID, GUID_PREFIX, etc.) are omitted
# from this extract


def download_and_insert(url, pubdate, pubtime):
    try:
        data = db_downloader.download_url(url)
    except Exception, e:
        print e
        return None
    if not data.get('text'):
        print 'db_url-stahovak: no text'
        return None
    dbdoc = Documents()
    dbdoc.set_pubDate(pubdate)
    dbdoc.set_pubTime(pubtime)
    dbdoc.set_text(control_chars.remove(data['text']))
    dbdoc.set_title(control_chars.remove(data.get('title', '')))
    dbdoc.set_source_id(SOURCE_ID)
    dbdoc.set_language(u'en')
    dbdoc.set_timestamp(timer.timestamp())
    dbdoc.set_link(url)
    dbdoc.set_html_description(control_chars.remove(data.get('description', '')))
    dbdoc.set_html_keywords(control_chars.remove(data.get('keywords', '')))
    dbdoc.set_guid(GUID_PREFIX + ":" + hashlib.sha224(url).hexdigest())
    ok, doc_id = dbdoc.insert()
    return doc_id
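
control_chars.remove is another project-local helper whose implementation is not part of these extracts. A plausible minimal stand-in - the regex and module layout are assumptions - strips the C0/C1 control characters that PostgreSQL rejects in text columns:

# -*- coding: utf-8 -*-
import re

# Hypothetical stand-in for the project's control_chars.remove helper.
_CONTROL_RE = re.compile(u'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]')

def remove(text):
    # strip control characters; PostgreSQL refuses \x00 in text values
    if not text:
        return text
    return _CONTROL_RE.sub(u'', text)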




def main():
    # logging init
    logger = logging.getLogger("db_stahovak")
    # NB: WARNING suppresses the logger.info()/debug() calls below
    logger.setLevel(logging.WARNING)

    # startup log message
    logger.info("START")

    # classifier
    tcl = TwitterClassifier()

    # get Twitter source ids - only Twitter documents get classified
    conn = get_connection()
    cursor = conn.cursor()

    cursor.execute("select id from sources_twitter")
    twitter_ids = [row[0] for row in cursor]

    while True:
        # feeds init
        # XXX performance problem - sources should be loaded once, before the while loop
        sources = MSources()
        sources.get_multi(where="_stahovak = true")
        feeds = [Sources(**data) for data in sources.value()]
        assert feeds
        items_count = 0
        for source in feeds:
            logger.info("SOURCE\tSECTION:%s\tLINK:%s" %
                        (source.get_section(), source.get_link()))
            modified = str2tuple(source.get_modified())
            data = downloader.download(source.get_link(), source.get_etag(),
                                       modified)
            # update etag/modified
            if data['etag'] or data['modified']:
                diff = False
                if source.get_etag() != data['etag']:
                    diff = True
                    source.set_etag(data['etag'])
                if modified != data['modified']:
                    diff = True
                    source.set_modified(tuple2str(data['modified']))
                if diff:
                    source.update()

            classified_as_irrelevant = 0
            # work with items
            for item in data['items']:
                items_count += 1
                # prepare new database insert
                Item = Documents()
                Item.set_timestamp(timer.timestamp())
                Item.set_source_id(source.get_id())
                Item.set_language(source.get_language())

                Item.set_title(control_chars.remove(item['title']))
                Item.set_text(control_chars.remove(item['text']))

                try:
                    Item.set_termvector(
                        get_termvector(Item.get_text(), Item.get_language(),
                                       conn))
                except psycopg2.ProgrammingError, e:
                    print str(e)
                    conn.rollback()  # reset the aborted transaction before reuse
                    continue

                Item.set__relevance(None)

                # only Twitter documents are classified
                if source.get_id() in twitter_ids:
                    score = tcl.classify(Item.get_text(), Item.get_language())
                    was_classified = (score != -1)
                    if was_classified and score < MIN_SCORE:
                        # below the relevance threshold - skip this item
                        classified_as_irrelevant += 1
                        continue
                    if was_classified:
                        Item.set__relevance(int(score * 100))

                Item.set_link(control_chars.remove(item['link']))
                Item.set_guid(source.get_section() + ":" +
                              control_chars.remove(item['guid']))

                if item['pubDate']:
                    pubDate = time.strftime("%Y-%m-%d", item['pubDate'])
                    if pubDate:
                        Item.set_pubDate(pubDate)
                    pubTime = time.strftime("%H:%M:%S%z", item['pubDate'])
                    if pubTime:
                        Item.set_pubTime(pubTime)
                if not Item.get_pubDate():
                    # we don't want items without a pubDate
                    continue

                # follow the item link and index the full page
                if source.get__follow():
                    url = item['link']
                    logger.debug("Following LINK:%s", url)
                    page = downloader.download_url(url)
                    Item.set_text(control_chars.remove(page.get('text', '')))
                    Item.set_html_description(
                        control_chars.remove(page.get('description', "")))
                    Item.set_html_keywords(
                        control_chars.remove(page.get('keywords', "")))

                # insert into the database
                if Item.get_text():
                    inserted, doc_id = Item.insert()
                    if inserted:
                        logger.debug("Document successfully inserted into db with id=%s", doc_id)
                        yield str(doc_id)  # output
                    else:
                        logger.debug("Document already in db with id=%s", doc_id)
                else:
                    logger.info("Item has no text!")

            # output per-source statistics
            logger.info("Created OUTPUT\tITEMS:%d\tIRRELEVANT:%d",
                        data['items_count'], classified_as_irrelevant)
        if not items_count:
            print "going to sleep"
            timer.sleep_second(SLEEP_TIME)
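
str2tuple and tuple2str, used above for the Last-Modified round-trip, are also project-local. A sketch under the assumption that the sources table stores the timestamp as a plain "%Y-%m-%d %H:%M:%S" string (the actual storage format is not shown in these extracts):

import time

_FMT = "%Y-%m-%d %H:%M:%S"

def tuple2str(t):
    # serialize a time.struct_time (or 9-tuple) for the DB column
    return time.strftime(_FMT, t) if t else None

def str2tuple(s):
    # parse the stored string back into a time.struct_time
    return time.strptime(s, _FMT) if s else None
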
Example #4

import hashlib

# project-local imports/constants (db_downloader, timer, control_chars,
# Documents, SOURCE_ID) are omitted from this extract

GUID_PREFIX = "urlentity"


def download_and_insert(url, pubdate, pubtime):
    try:
        data = db_downloader.download_url(url)
    except Exception, e:
        print e
        return None
    if not data.get('text'):
        print 'db_url-stahovak: no text'
        return None
    dbdoc = Documents()
    dbdoc.set_pubDate(pubdate)
    dbdoc.set_pubTime(pubtime)
    dbdoc.set_text(control_chars.remove(data['text']))
    dbdoc.set_title(control_chars.remove(data.get('title', '')))
    dbdoc.set_source_id(SOURCE_ID)
    dbdoc.set_language(u'en')
    dbdoc.set_timestamp(timer.timestamp())
    dbdoc.set_link(url)
    dbdoc.set_html_description(
        control_chars.remove(data.get('description', '')))
    dbdoc.set_html_keywords(control_chars.remove(data.get('keywords', '')))
    dbdoc.set_guid(GUID_PREFIX + ":" + hashlib.sha224(url).hexdigest())
    ok, doc_id = dbdoc.insert()
    return doc_id
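
The guid doubles as a deduplication key: hashing the URL with sha224 gives a stable identifier, so presumably Documents.insert() can report a second download of the same page as already present (the "ok" flag above). For example:

import hashlib

url = "http://example.com/article"
# GUID_PREFIX is "urlentity", as defined above
guid = "urlentity:" + hashlib.sha224(url).hexdigest()
# the same URL always produces the same guid, so re-running the
# script on an unchanged input line does not create a duplicate row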


if __name__ == '__main__':
    """Download for each line in form 'url\tpubdate\tpubtime'"""
    import sys
    for line in sys.stdin:
        if not line.strip():
            continue
        url, pubdate, pubtime = line.rstrip('\n').split('\t', 2)
        print download_and_insert(url, pubdate, pubtime)