def main():
  db,cur = cleanDb.openConnection()
  
  dest_dir = sys.argv[1]
  languages = {lang: (liso, lname, set(tlds.split(',')))
               for lang, liso, lname, tlds in [x.split(':') for x in sys.argv[2:]]}
  
  tbl = open(dest_dir + '/table.txt', 'w')
  for lang in languages:
    liso,lname,tlds = languages[lang]
    print 'begin: ',lang,tlds	# tlds = set of tlds, lang = lang_altcode
    Qparams = {'tlds':tuple(tlds), 'excluded':tuple(excluded_languages - {lang} - known_fails.get(lang,set())), 'limit': 1000}
    print "Q params", Qparams
    cur.execute(Q, Qparams)
    print cur.rowcount, "rows"
    s = cur.fetchone()[0]
    s = re.sub(r"(\s|[0-9])+", " ", s, flags=re.MULTILINE)
    print 'strlen =', len(s)
    #print s
    #raw_input()
    
    tgms = defaultdict(int)
    for tgm in trigrams(s):
      tgms[tgm] += 1
    #print tgms
    
    tbl.write('%s\t%s\t%s\n' % (lang,liso,lname))
    with codecs.open((dest_dir + '/%s-3grams.txt') % lang, 'w', encoding='utf-8') as f:
      f.write('\n'.join("%d %s" % (tgms[tgm], tgm) for tgm in tgms))
    
    print 'end'

  tbl.close()
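
# The trigrams() helper used above is defined elsewhere in this module and is not
# shown in this example. One plausible implementation -- a sliding 3-character
# window over the cleaned string -- is sketched here for illustration; the actual
# definition may differ.
def trigrams(s):
  for i in xrange(len(s) - 2):
    yield s[i:i+3]
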
Example #2
def enrycher_worker(in_queue, out_queue, url=None):
	"""
	Worker thread. Takes an article dict from in_queue, adds the enrycher xml,
	puts the enryched article in out_queue.
	If `url` is given, queries Enrycher at that URL, otherwise the URL is constructed
	based on the language of each article in in_queue.
	"""
	conn, cur = openConnection('rych info writer')
	while True:
		try:
			target_url = url   # may be auto-detected below, per article
			article = in_queue.get()
			lang = article.get('lang','').split('-')[0]

			# auto-detect the URL from this article's language
			if not target_url:
				if lang in ('en','eng','enz'):
					if 0 and article.get('google_story_id'):
						target_url = 'http://aidemo.ijs.si:8080/EnrycherWeb-render/run-render'  # all + stanford parses + sentiment
					else:
						target_url = 'http://aidemo.ijs.si:8080/EnrycherWeb-render/run-demo'
				elif lang in ('sl','slv'):
					target_url = 'http://aidemo.ijs.si:8080/EnrycherWeb-render/sl-run'
				else:
					raise ValueError('Unsupported language: %r' % lang)

			#print '[%s] pre-enrych %s' % (threading.currentThread().name, article['id'])
			#print article['id'], lang, `article.get('google_story_id')`, url
			article['rych'] = enrych(article['cleartext'], target_url)
			#print '[%s] pre-db %s' % (threading.currentThread().name, article['id'])
			DB_write_rych_info(cur, article)
			#print '[%s] pre-out-enqueue %s' % (threading.currentThread().name, article['id'])
			out_queue.put(article)

		except Exception as exc:
			# pass through the unenryched article
			out_queue.put(article)

			# report error
			print '!! error while processing article %s (lang %s) at %r' % (article.get('id'), article.get('lang'), target_url)
			txt = article.get('cleartext', '').replace('\n',' ')
			print 'Some stats about the input data: %d bytes, %d sentences, max sentence length %d bytes. File saved to /tmp/bad_enrycher_input' % (
				len(txt), len(txt.split('. ')), max(map(len,txt.split('. '))+[-1]) )
			print exc, exc.args
			try:
				with open('/tmp/bad_enrycher_input','w') as badf:
					badf.write(txt.encode('utf8'))
			except:
				print '(file not saved, IOError)'
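
# A minimal usage sketch for the worker above (hypothetical wiring, not part of
# the original module): push article dicts into in_queue, run a couple of worker
# threads, and read the enryched articles back from out_queue.
import threading
from Queue import Queue

in_queue, out_queue = Queue(), Queue()
for _ in range(2):
	t = threading.Thread(target=enrycher_worker, args=(in_queue, out_queue))
	t.daemon = True   # workers loop forever, so don't block interpreter exit
	t.start()

in_queue.put({'id': 1, 'lang': 'en', 'cleartext': u'Some article text.'})
enryched = out_queue.get()   # same dict, with 'rych' added (or unchanged if enryching failed)
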
Example #4
def main():
	global article, conn
	from cleanDb import openConnection
	conn, cur = openConnection('cleartext feed')
	conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)	# disable transactions, so we receive NOTIFYs in real time
	cur_listen = conn.cursor()
	cur_listen.execute("LISTEN have_cleartext")
  
	zmqctx = zmq.Context()
	sock = zmqctx.socket(zmq.PUB)
	sock.setsockopt(zmq.HWM, 100)
	sock.bind('tcp://*:13371')

	try:
		while True:
			if select.select([conn],[],[],5) == ([],[],[]):
				print '(nothing to do)'
			else:
				conn.poll()
				notifies = conn.notifies
				while notifies:
					notify = notifies.pop()
					article_id = int(notify.payload)
					try:
						article = DB_get_full_article(cur, article_id)
						if article is None:
							print "skipping %s (not found)" % article_id
							continue
						elif not article['cleartext']:
							print "skipping %s (no cleartext)" % article_id
							continue
						sock.send_pyobj(article)
						print "ok %s" % article_id + ('(old)' if article['found_date'].year<2012 else '')
					except:
						print "!!! error while processing %s" % article_id
						traceback.print_exc()
	except:
		traceback.print_exc()
		return
	finally:
		sock.close()
		zmqctx.term()
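
# A minimal consumer sketch for the feed above (hypothetical, not part of the
# original module): connect a SUB socket to port 13371 and unpickle the article
# dicts as they are published.
import zmq

ctx = zmq.Context()
sub = ctx.socket(zmq.SUB)
sub.connect('tcp://localhost:13371')
sub.setsockopt(zmq.SUBSCRIBE, '')   # subscribe to all messages
while True:
	article = sub.recv_pyobj()       # the dict sent with sock.send_pyobj(article)
	print article['id'], article.get('lang')
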
Example #9
"""
Get a sample of articles from the news DB, show language distribution according to
 - existing 'lang' column in the DB
 - google's CLD (executed on the fly for each article)
"""

import os, sys

sys.path.extend(('.', '..'))
from cleanDb import openConnection
import cld

conn, cur = openConnection()
cur = conn.cursor('x')
cur.execute(
    "SELECT m.id, p.content, m.lang_altcode FROM processed_article p JOIN feed_article_meta m ON (p.feed_articleid = m.id) WHERE p.mode='cleartext' ORDER BY m.id DESC LIMIT 100000"
)

cnt = {}
cnt2 = {}
while True:
    row = cur.fetchone()
    if not row: break
    aid, txt, lang = row
    lang = str(lang[:2])
    lang2 = cld.detect(txt.encode('utf8', 'ignore'))[1]
    cnt[lang] = cnt.get(lang, 0) + 1
    cnt2[lang2] = cnt2.get(lang2, 0) + 1
    print 'done', sum(cnt.itervalues())

print 'done'
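
# The two counters above are only reported as a running total; a small follow-up
# sketch (not part of the original script) to show the resulting language
# distributions side by side:
print 'lang column :', sorted(cnt.items(), key=lambda kv: -kv[1])[:20]
print 'CLD detected:', sorted(cnt2.items(), key=lambda kv: -kv[1])[:20]
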
from Queue import Queue

import cld
import sqlite3

from cleanDb import openConnection
import zmq2zmq_enrych; reload(zmq2zmq_enrych)
from zmq2zmq_enrych import is_enrychable, enrycher_worker
import db2zmq_cleartext2; reload(db2zmq_cleartext2)
from db2zmq_cleartext2 import DB_get_full_article
import serialize2; reload(serialize2)
from iso_map import iso2to3

DB_OUT = '/tmp/deleteme.sqlite'  # path to the sqlite DB
MAX_ENRYCHER_REQUESTS = 2     # max number of simultaneous requests

conn_in, cur_in = openConnection('history_export')
cur_in_ids = conn_in.cursor('foo')  # named cursor for incremental fetching
conn_out = sqlite3.connect(DB_OUT)
conn_out.isolation_level = None   # autocommit mode
cur_out = conn_out.cursor()

# set up the output DB
cur_out.execute("CREATE TABLE IF NOT EXISTS news (article_id integer primary key, story_id text, xml text);")
cur_out.execute("CREATE INDEX IF NOT EXISTS article_id_idx ON news(article_id);")

# fetch IDs to export
#cur_in_ids.execute("SELECT feed_articleid, story_id FROM feed_article_googles g JOIN feed_article fa ON (fa.id=g.feed_articleid) WHERE fa.id>45909130 AND fa.found BETWEEN '2012-04-01' AND '2012-05-01' ORDER BY id")  # 537466 articles; not taking lang into account
#cur_in_ids.execute("SELECT feed_articleid, story_id FROM feed_article_googles")
#cur_in_ids.execute("SELECT feed_articleid, story_id FROM feed_article_googles g JOIN feed_article fa ON (fa.id=g.feed_articleid) LIMIT 10 OFFSET 10")
#cur_in_ids.execute("SELECT 20932789 AS feed_articleid, 'FAKE_STORY' as story_id")
cur_in_ids.execute("SELECT feed_articleid, story_id FROM feed_article_googles g JOIN feed_article fa ON (fa.id=g.feed_articleid) ORDER BY fa.id DESC LIMIT 1000 OFFSET 1000")
id_rows = cur_in_ids.fetchall()
conn_in.commit()
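
# A hypothetical sketch (not part of the original snippet) of how a row matching
# the `news` schema created above could be written once the Enrycher XML for an
# article is available; with isolation_level=None the insert is autocommitted.
def write_news_row(article_id, story_id, xml):
	cur_out.execute(
		"INSERT OR REPLACE INTO news (article_id, story_id, xml) VALUES (?, ?, ?)",
		(article_id, story_id, xml))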