def main():
    OKDOC = 0
    conn = get_connection()
    curr = conn.cursor()
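    # Repeatedly fetch unprocessed documents in batches of 50 and stop once none are left.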
    while True:
        curr.execute("""
                SELECT id, text, sentences FROM documents 
                WHERE language in ('de', 'en')
                -- and id = 18811 -- XXX
                --AND _stanford = True
                AND sentences IS NOT NULL
                AND EXISTS (
                    SELECT 1 FROM instances
                    WHERE 
                        item_id=documents.id 
                        AND sid IS NULL
                    )
                --ORDER BY random()
                LIMIT 50""")
        count = 0
        for doc in curr:
            id, text, sents = doc
            try:
                errmsg = analyze(id, text, sents)
                if errmsg:
                    print "ERROR", errmsg
                else:
                    OKDOC += 1
            except IndexError, e:
                print "Error for id = ", id
                print "IndexError", str(e)
            count += 1
        if count == 0:
            print "FINISH"
            break
Example No. 2
def main(logger):
    conn = get_connection()

    if len(sys.argv) < 2:
        print "USAGE ./master.py JOB_NUMBER [JOB_NUMBER]+\nSee job_list in source code!"
        return 1

    jobs_list = []
    numbers = []

    for number in sys.argv[1:]:
        job_list_number = int(number)
        numbers.append(job_list_number)

        job_lists = {
            # Just for test:
            # 0: [(testRaise, (), {}), (testRaise, (), {}),  # TEST
            #     (testReturn, (), {})],
            1: [(cron, ('crontab', ), {})],  # NORMAL
            # Obsolete:
            # 2: [(stanford, (), {'cursor': conn.cursor(), 'sys_path': '../db'})],
            # 3: [(analyzed, ('../stahovak/analyzed.py',), {})],
            # 4: [(url_stahovak, (), {'cursor': conn.cursor(), 'sys_path': '../stahovak'})],
        }
        jobs_list += job_lists[job_list_number]

    #handler = logging.handlers.TimedRotatingFileHandler(
    #        LOG_DIR + "+".join(sys.argv[1:]) + "-" + LOG_FILENAME, when='D', interval=1, backupCount=3)
    #handler.setFormatter(logging.Formatter(fmt='%(asctime)s\t%(levelname)s\t%(name)s\t%(message)s'))
    #logger.addHandler(handler)

    logger.info('Starting with jobs {%s}' % ', '.join(map(str, numbers)))

    jobs = []
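    # Start every selected job in its own process (or a thread if the multiprocessing module is unavailable), then wait for all of them to finish.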
    for i, todo in enumerate(jobs_list):
        (target, args, kwargs) = todo
        kwargs['__process'] = i
        if multiprocessing:
            p = multiprocessing.Process(target=target,
                                        args=args,
                                        kwargs=kwargs)
        else:
            p = threading.Thread(target=target, args=args, kwargs=kwargs)
        jobs.append((p, todo))
        p.start()
    for job, _ in jobs:
        job.join()
    return 0
Example No. 3
def main(logger):
    conn = get_connection()

    if len(sys.argv) < 2:
        print "USAGE ./master.py JOB_NUMBER [JOB_NUMBER]+\nSee job_list in source code!"
        return 1

    jobs_list = []
    numbers = []

    for number in sys.argv[1:]:
        job_list_number = int(number)
        numbers.append(job_list_number)
        
        job_lists = {
            # Just for test:
            # 0: [(testRaise, (), {}), (testRaise, (), {}),  # TEST
            #     (testReturn, (), {})],
            1: [(cron, ('crontab',), {})],  # NORMAL
            # Obsolete:
            # 2: [(stanford, (), {'cursor': conn.cursor(), 'sys_path': '../db'})],
            # 3: [(analyzed, ('../stahovak/analyzed.py',), {})],
            # 4: [(url_stahovak, (), {'cursor': conn.cursor(), 'sys_path': '../stahovak'})],
        }
        jobs_list += job_lists[job_list_number]

    #handler = logging.handlers.TimedRotatingFileHandler(
    #        LOG_DIR + "+".join(sys.argv[1:]) + "-" + LOG_FILENAME, when='D', interval=1, backupCount=3)
    #handler.setFormatter(logging.Formatter(fmt='%(asctime)s\t%(levelname)s\t%(name)s\t%(message)s'))
    #logger.addHandler(handler)

    logger.info('Starting with jobs {%s}' % ', '.join(map(str, numbers)))

    jobs = []
    for i, todo in enumerate(jobs_list):
        (target, args, kwargs) = todo
        kwargs['__process'] = i
        if multiprocessing:
            p = multiprocessing.Process(target=target, args=args, kwargs=kwargs)
        else:
            p = threading.Thread(target=target, args=args, kwargs=kwargs)
        jobs.append((p, todo))
        p.start()
    for job, _ in jobs:
        job.join()
    return 0
Example No. 4
def main():
    OKDOC = 0
    conn = get_connection()
    curr = conn.cursor()
    while True:
        curr.execute(
            """
                SELECT id, text, sentences FROM documents 
                WHERE language in ('de', 'en')
                -- and id = 18811 -- XXX
                --AND _stanford = True
                AND sentences IS NOT NULL
                AND EXISTS (
                    SELECT 1 FROM instances
                    WHERE 
                        item_id=documents.id 
                        AND sid IS NULL
                    )
                --ORDER BY random()
                LIMIT 50"""
        )
        count = 0
        for doc in curr:
            id, text, sents = doc
            try:
                errmsg = analyze(id, text, sents)
                if errmsg:
                    print "ERROR", errmsg
                else:
                    OKDOC += 1
            except IndexError, e:
                print "Error for id = ", id
                print "IndexError", str(e)
            count += 1
        if count == 0:
            print "FINISH"
            break
Example No. 5
            else:
                occ[id] = 1
    pairs = []
    for id, counts in occ.iteritems():
        pairs.append((id, counts))
    # sort and make string
    restokens = []
    for id, occ_token in sorted(pairs, key=lambda a: a[0]):
        restokens.append("%s:%d" % (id, occ_token))
    termvector = " ".join(restokens)
    return termvector


if __name__ == "__main__":
    LIMIT = 20
    conn = get_connection(UNICODE=True)
    # conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    print "Analyzing whole database"
    sys.stdout.flush()
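    # Process documents that have no termvector yet, LIMIT rows at a time.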
    while True:
        curr = conn.cursor()
        curr.execute(
            """SELECT id, language, text FROM documents WHERE termvector is null and language in ('en', 'de') 
                and pubdate>='01-05-2011' and pubdate<'01-07-2011' 
                LIMIT %s""",
            (LIMIT,),
        )
        # curr.execute("""SELECT id, text FROM documents WHERE id=20875243""")
        count = 0
        for document in curr:
            id, lang, text = document
Example No. 6
    # tuples to lists
    data = map(list, data)
    return data


if __name__ == '__main__':
    data = {}
    print "DATA FETCHING"
    for name, v in config.iteritems():
        url = v[0]
        data[name] = load(url)
        print "\t%s : %d" % (name, len(data[name]))

    print
    print "DB INTERACT"
    conn = get_connection()

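    # Mark the start of this run in the log file and open a timestamped CSV for its output.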
    logfile = open(LOG_FILE, 'a')
    logfile.write("=" * 80)
    logfile.write("\nSTART (" + str(timer.timestamp()) + ')\n')
    outfile = open(OUT_FOLDER + str(timer.timestamp()) + '.csv', 'wb')
    csv_writer = UnicodeWriter(outfile)
    results = {}
    for name, values in data.iteritems():
        cols = config[name][1]
        cur = conn.cursor()
        cur.execute('select %s from %s' % (', '.join(cols), name))
        in_db = cur.fetchall()

        cur_insert = conn.cursor()
        print "\ttable : %s" % name
Example No. 7
    data = set(data)
    # tuples to lists
    data = map(list, data)
    return data

if __name__ == '__main__':
    data = {}
    print "DATA FETCHING"
    for name, v in config.iteritems():
        url = v[0]
        data[name] = load(url)
        print "\t%s : %d" % (name, len(data[name]))

    print
    print "DB INTERACT"
    conn = get_connection()

    logfile = open(LOG_FILE, 'a')
    logfile.write("=" * 80)
    logfile.write("\nSTART (" + str(timer.timestamp()) + ')\n')
    outfile = open(OUT_FOLDER + str(timer.timestamp()) + '.csv', 'wb')
    csv_writer = UnicodeWriter(outfile)
    results = {}
    for name, values in data.iteritems():
        cols = config[name][1]
        cur = conn.cursor()
        cur.execute('select %s from %s' % (', '.join(cols), name))
        in_db = cur.fetchall()

        cur_insert = conn.cursor()
        print "\ttable : %s" % name
Example No. 8
def main():
    # logging init
    logger = logging.getLogger("db_stahovak")
    logger.setLevel(logging.WARNING)

    # start info
    logger.info("START") 
    
    # classifier
    tcl = TwitterClassifier()

    # get Twitter ids - only Twitter documents are classified
    conn = get_connection()
    cursor = conn.cursor()

    cursor.execute("select id from sources_twitter")
    twitter_ids = [id[0] for id in cursor]

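    # Main crawl loop: re-download every source flagged for fetching, insert new documents, and sleep when nothing new arrives.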
    while True:
        # feeds init
        # XXX - performance problems - sources should be before while...
        sources = MSources()
        sources.get_multi(where="_stahovak = true")
        feeds = [Sources(**data) for data in sources.value()]
        assert feeds
        items_count = 0
        for source in feeds:
            logger.info("SOURCE\tSECTION:%s\tLINK:%s" % (source.get_section(), source.get_link()))
            modified = str2tuple(source.get_modified())
            data = downloader.download(source.get_link(), 
                                       source.get_etag(), 
                                       modified)
            # update etag/modified
            if data['etag'] or data['modified']:
                diff = False
                if source.get_etag() != data['etag']:
                    diff = True
                    source.set_etag(data['etag'])
                if modified != data['modified']:
                    diff = True
                    source.set_modified(tuple2str(data['modified']))
                if diff:
                    source.update()

            classified_as_irelevant = 0
            # work with items
            for item in data['items']:
                items_count += 1
                # prepare new database insert
                Item = Documents()
                Item.set_timestamp(timer.timestamp())
                Item.set_source_id(source.get_id())
                Item.set_language(source.get_language())

                Item.set_title(control_chars.remove(item['title']))
                Item.set_text(control_chars.remove(item['text']))

                try:
                    Item.set_termvector(get_termvector(
                        Item.get_text(), Item.get_language(),
                        conn))
                except psycopg2.ProgrammingError, e:
                    print str(e)
                    continue

                Item.set__relevance(None)
                
                # we classify only Twitter documents
                if source.get_id() in twitter_ids:
                    score = tcl.classify(Item.get_text(), Item.get_language())
                    was_classified = (score != -1)
                    if was_classified and score < MIN_SCORE:
                        # skip items classified as irrelevant
                        classified_as_irelevant += 1
                        continue
                    if was_classified:
                        Item.set__relevance(int(score * 100))


                Item.set_link(control_chars.remove(item['link']))
                Item.set_guid(source.get_section()+":"+control_chars.remove(item['guid']))

                if item['pubDate']:
                    pubDate = time.strftime("%Y-%m-%d", item['pubDate'])
                    if pubDate: Item.set_pubDate(pubDate)
                    pubTime = time.strftime("%H:%M:%S%z", item['pubDate'])
                    if pubTime: Item.set_pubTime(pubTime)
                if not Item.get_pubDate():
                    # don't want items without a pubdate
                    continue

                ## following links
                if source.get__follow():
                    url = item['link']
                    logger.debug("Following LINK:%s", url)
                    page = downloader.download_url(url)
                    Item.set_text(control_chars.remove(page.get('text', '')))
                    Item.set_html_description(control_chars.remove(page.get('description', "")))
                    Item.set_html_keywords(control_chars.remove(page.get('keywords', "")))

                # insert it
                if Item.get_text():
                    inserted, id = Item.insert()
                    if inserted:
                        logger.debug("Document successfully inserted into db with id=%s" % Item.get_id())
                        yield str(id) # output
                    else:
                        logger.debug("Document already in db with id=%s" % id)
                else:
                    logger.info("Item has no text!")

            # outputting
            logger.info("Created OUTPUT\tITEMS:%d\tIRELEVANT:%d", data['items_count'], classified_as_irelevant)
        if not items_count:
            print "going to sleep"
            timer.sleep_second(SLEEP_TIME)
Example No. 9
def main():
    # logging init
    logger = logging.getLogger("db_stahovak")
    logger.setLevel(logging.WARNING)

    # start info
    logger.info("START")

    # classifier
    tcl = TwitterClassifier()

    # get Twitter ids - only Twitter documents are classified
    conn = get_connection()
    cursor = conn.cursor()

    cursor.execute("select id from sources_twitter")
    twitter_ids = [id[0] for id in cursor]

    while True:
        # feeds init
        # XXX - performance problems - sources should be before while...
        sources = MSources()
        sources.get_multi(where="_stahovak = true")
        feeds = [Sources(**data) for data in sources.value()]
        assert feeds
        items_count = 0
        for source in feeds:
            logger.info("SOURCE\tSECTION:%s\tLINK:%s" %
                        (source.get_section(), source.get_link()))
            modified = str2tuple(source.get_modified())
            data = downloader.download(source.get_link(), source.get_etag(),
                                       modified)
            # update etag/modified
            if data['etag'] or data['modified']:
                diff = False
                if source.get_etag() != data['etag']:
                    diff = True
                    source.set_etag(data['etag'])
                if modified != data['modified']:
                    diff = True
                    source.set_modified(tuple2str(data['modified']))
                if diff:
                    source.update()

            classified_as_irelevant = 0
            # work with items
            for item in data['items']:
                items_count += 1
                # prepare new database insert
                Item = Documents()
                Item.set_timestamp(timer.timestamp())
                Item.set_source_id(source.get_id())
                Item.set_language(source.get_language())

                Item.set_title(control_chars.remove(item['title']))
                Item.set_text(control_chars.remove(item['text']))

                try:
                    Item.set_termvector(
                        get_termvector(Item.get_text(), Item.get_language(),
                                       conn))
                except psycopg2.ProgrammingError, e:
                    print str(e)
                    continue

                Item.set__relevance(None)

                # we classify only Twitter documents
                if source.get_id() in twitter_ids:
                    score = tcl.classify(Item.get_text(), Item.get_language())
                    was_classified = (score != -1)
                    if was_classified and score < MIN_SCORE:
                        # skip items classified as irrelevant
                        classified_as_irelevant += 1
                        continue
                    if was_classified:
                        Item.set__relevance(int(score * 100))

                Item.set_link(control_chars.remove(item['link']))
                Item.set_guid(source.get_section() + ":" +
                              control_chars.remove(item['guid']))

                if item['pubDate']:
                    pubDate = time.strftime("%Y-%m-%d", item['pubDate'])
                    if pubDate: Item.set_pubDate(pubDate)
                    pubTime = time.strftime("%H:%M:%S%z", item['pubDate'])
                    if pubTime: Item.set_pubTime(pubTime)
                if not Item.get_pubDate():
                    # don't want items without a pubdate
                    continue

                ## following links
                if source.get__follow():
                    url = item['link']
                    logger.debug("Following LINK:%s", url)
                    page = downloader.download_url(url)
                    Item.set_text(control_chars.remove(page.get('text', '')))
                    Item.set_html_description(
                        control_chars.remove(page.get('description', "")))
                    Item.set_html_keywords(
                        control_chars.remove(page.get('keywords', "")))

                # insert it
                if Item.get_text():
                    inserted, id = Item.insert()
                    if inserted:
                        logger.debug(
                            "Document successfully inserted into db with id=%s"
                            % Item.get_id())
                        yield str(id)  # output
                    else:
                        logger.debug("Document already in db with id=%s" % id)
                else:
                    logger.info("Item has no text!")

            # outputting
            logger.info("Created OUTPUT\tITEMS:%d\tIRELEVANT:%d",
                        data['items_count'], classified_as_irelevant)
        if not items_count:
            print "going to sleep"
            timer.sleep_second(SLEEP_TIME)
Example No. 10
def main():
    conn = get_connection(UNICODE=True)
    curr = conn.cursor()
    tokenizer = TreebankWordTokenizer()

    while True:
        curr.execute("""SELECT id, text, language FROM documents 
                WHERE
                --guid='tw:122144569302323201'
                EXISTS ( SELECT 1 FROM instances WHERE item_id=documents.id AND begintoken IS NULL)
                LIMIT 1""")
        data = curr.fetchone()
        if data is None:
            print "sleep"
            timer.sleep_minute(30)
            continue
        id, text, lang = data
        print "id", id
        curr.execute("""SELECT * FROM instances
                WHERE item_id = %s
                AND begintoken IS NULL""", (id,))
        # throw away `confidence`
        instances = [list(x)[:-1] for x in curr]
        if not len(instances):
            continue
        instance_ = []
        for ins in instances:
            ins[-1] = None
            ins[-2] = None
            ins[-3] = None
            instance_.append(ins)
        instances = instance_
        #print instances

        sent_tok = PunktSentenceTokenizer()

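        # Map each instance's character offset onto a sentence id and whitespace-token positions within that sentence.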
        for sid, sentidx in enumerate(sent_tok.span_tokenize(text)):
            #print '++++'
            sentence = text[sentidx[0]:sentidx[1]]
            #print sentence
            #print '----'
            for pos, indexes in enumerate(WhitespaceTokenizer().span_tokenize(sentence)):
                # TODO: token indexes are only relative to the sentence,
                # but instances store them from the start of the whole text!
                indexes = list(indexes)
                indexes[0] = sentidx[0] + indexes[0]
                indexes[1] = sentidx[0] + indexes[1]
                word = text[indexes[0]:indexes[1]]
                #print pos, word, indexes

                for i, instance in enumerate(instances):
                    id, entity_id, item_id, exact, offset, length, sid_, begin, end = instance
                    #print i,instance
                    if sid_ is None:
                        if begin is None:
                            if offset >= indexes[0] and offset <= indexes[1]:
                                instances[i][-2] = begin = pos
                                instances[i][-3] = sid_ = sid
                    if sid_ == sid:
                        if end is None and begin is not None:
                            off = offset + length
                            if off <= indexes[1] and off >= indexes[0]:
                                instances[i][-1] = pos
                                if off == indexes[0]:
                                    instances[i][-1] = pos - 1
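        # Write the resolved sentence id and token span back to the instances table.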
        for instance in instances:
            print instance
            id, entity_id, item_id, exact, offset, length, sid, begin, end = instance
            #print exact, ">>", sid, begin, end
            if end is None:
                if " " not in exact:
                    end = begin
                else:
                    end = -1
            curr.execute("""UPDATE instances
                    SET sid=%s, begintoken=%s, endtoken=%s
                    WHERE id=%s""", (sid, begin, end, id))
Example No. 11
def main():
    conn = get_connection(UNICODE=True)
    curr = conn.cursor()
    tokenizer = TreebankWordTokenizer()

    while True:
        curr.execute("""SELECT id, text, language FROM documents 
                WHERE
                --guid='tw:122144569302323201'
                EXISTS ( SELECT 1 FROM instances WHERE item_id=documents.id AND begintoken IS NULL)
                LIMIT 1""")
        data = curr.fetchone()
        if data is None:
            print "sleep"
            timer.sleep_minute(30)
            continue
        id, text, lang = data
        print "id", id
        curr.execute(
            """SELECT * FROM instances
                WHERE item_id = %s
                AND begintoken IS NULL""", (id, ))
        # throw away `confidence`
        instances = [list(x)[:-1] for x in curr]
        if not len(instances):
            continue
        instance_ = []
        for ins in instances:
            ins[-1] = None
            ins[-2] = None
            ins[-3] = None
            instance_.append(ins)
        instances = instance_
        #print instances

        sent_tok = PunktSentenceTokenizer()

        for sid, sentidx in enumerate(sent_tok.span_tokenize(text)):
            #print '++++'
            sentence = text[sentidx[0]:sentidx[1]]
            #print sentence
            #print '----'
            for pos, indexes in enumerate(
                    WhitespaceTokenizer().span_tokenize(sentence)):
                # TODO: token indexes are only relative to the sentence,
                # but instances store them from the start of the whole text!
                indexes = list(indexes)
                indexes[0] = sentidx[0] + indexes[0]
                indexes[1] = sentidx[0] + indexes[1]
                word = text[indexes[0]:indexes[1]]
                #print pos, word, indexes

                for i, instance in enumerate(instances):
                    id, entity_id, item_id, exact, offset, length, sid_, begin, end = instance
                    #print i,instance
                    if sid_ is None:
                        if begin is None:
                            if offset >= indexes[0] and offset <= indexes[1]:
                                instances[i][-2] = begin = pos
                                instances[i][-3] = sid_ = sid
                    if sid_ == sid:
                        if end is None and begin is not None:
                            off = offset + length
                            if off <= indexes[1] and off >= indexes[0]:
                                instances[i][-1] = pos
                                if off == indexes[0]:
                                    instances[i][-1] = pos - 1
        for instance in instances:
            print instance
            id, entity_id, item_id, exact, offset, length, sid, begin, end = instance
            #print exact, ">>", sid, begin, end
            if end is None:
                if " " not in exact:
                    end = begin
                else:
                    end = -1
            curr.execute(
                """UPDATE instances
                    SET sid=%s, begintoken=%s, endtoken=%s
                    WHERE id=%s""", (sid, begin, end, id))
Example No. 12
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# data output
fileDataDiseases = UnicodeWriter(open(FOLDER + 'diseases_list.dat', 'w'),
                                 dialect=csv.excel)
fileDataSymptoms = UnicodeWriter(open(FOLDER + 'symptoms_list.dat', 'w'),
                                 dialect=csv.excel)

conn = get_connection(UNICODE=True)
cur = conn.cursor()
today = '%s' % date.today()

## count documents by languages
#cur.execute("""select language, count(*) from documents group by language""")
#for lang, count in cur:
#    f = open(FOLDER + 'languages-%s.dat' % lang, 'a')
#    f.write("%s\t%d\n" % ( today, count ))
#    f.flush()
#    f.close()
#cur.execute("""select count(*) from documents where language not in ('en', 'de')""")
#(count,) = cur.fetchone()
#f = open(FOLDER + 'languages-others.dat', 'a')
#f.write("%s\t%s\n" % (today, count))
#f.flush()