Esempio n. 1
0
def addJLPTLevels(dbFile, jlptFile, level, outFileName):
    global db, dbConnectionName

    outFile = codecs.open(outFileName, 'w', 'utf-8')

    db = getDb(dbConnectionName)
    db.setDatabaseName(dbFile)
    if not db.open():
        usage()
        sys.exit(1)
    query = QSqlQuery(db)

    re1 = QtCore.QRegExp(u'~?([\w]+)~?\s+\\[?~?([\w]+)~?\\]?.*')
    re2 = QtCore.QRegExp(u'~?([\w]+)~?.*')
    file = open(jlptFile, "r")
    for line in file.xreadlines():
        if len(line.strip()) == 0 or line[0] == '#': continue
        s = QtCore.QString.fromUtf8(line)
        if re1.exactMatch(s):
            matches = re1.capturedTexts()
            kanji = unicode(matches[1])
            kana = unicode(matches[2])
            qString = "select kana.id from kanaText join kana on kana.docid = kanaText.docid join kanji on kana.id = kanji.id join kanjiText on kanji.docid = kanjiText.docid join entries on entries.id = kana.id where kanaText.reading match ? and kanjiText.reading match ? order by kanji.priority asc, entries.frequency desc"
            query.prepare(qString)
            query.addBindValue(QtCore.QVariant(kana))
            query.addBindValue(QtCore.QVariant(kanji))
        elif re2.exactMatch(s):
            matches = re2.capturedTexts()
            kanji = None
            kana = unicode(matches[1])
            qString = "select kana.id from kanaText join kana on kana.docid = kanaText.docid join entries on entries.id = kana.id where kanaText.reading match ? order by entries.frequency desc"
            query.prepare(qString)
            query.addBindValue(QtCore.QVariant(kana))
        else:
            print "Unmatched line: " + line,
            continue

        if not query.exec_():
            print query.lastError().text()
            raise Exception
        if not query.next():
            print "No result for %s %s" % (kanji, kana)
            continue
        outFile.write("%d\n" % (query.value(0).toInt()[0]))
        if query.next(): print "Multiple results for %s %s" % (kanji, kana)
Esempio n. 2
0
def addJLPTLevels(dbFile, jlptFile, level, outFileName):
	global db, dbConnectionName

	outFile = codecs.open(outFileName, 'w', 'utf-8')

	db = getDb(dbConnectionName)
	db.setDatabaseName(dbFile)
	if not db.open():
		usage()
		sys.exit(1)
	query = QSqlQuery(db)

	re1 = QtCore.QRegExp(u'~?([\w]+)~?\s+\\[?~?([\w]+)~?\\]?.*')
	re2 = QtCore.QRegExp(u'~?([\w]+)~?.*')
	file = open(jlptFile, "r")
	for line in file.xreadlines():
		if len(line.strip()) == 0 or line[0] == '#': continue
		s = QtCore.QString.fromUtf8(line)
		if re1.exactMatch(s):
			matches = re1.capturedTexts()
			kanji = unicode(matches[1])
			kana = unicode(matches[2])
			qString = "select kana.id from kanaText join kana on kana.docid = kanaText.docid join kanji on kana.id = kanji.id join kanjiText on kanji.docid = kanjiText.docid join entries on entries.id = kana.id where kanaText.reading match ? and kanjiText.reading match ? order by kanji.priority asc, entries.frequency desc"
			query.prepare(qString)
			query.addBindValue(QtCore.QVariant(kana))
			query.addBindValue(QtCore.QVariant(kanji))
		elif re2.exactMatch(s):
			matches = re2.capturedTexts()
			kanji = None
			kana = unicode(matches[1])
			qString = "select kana.id from kanaText join kana on kana.docid = kanaText.docid join entries on entries.id = kana.id where kanaText.reading match ? order by entries.frequency desc"
			query.prepare(qString)
			query.addBindValue(QtCore.QVariant(kana))
		else:
			print "Unmatched line: " + line,
			continue

		if not query.exec_(): print query.lastError().text(); raise Exception
		if not query.next(): print "No result for %s %s" % (kanji, kana); continue
		outFile.write("%d\n" % (query.value(0).toInt()[0]))
		if query.next(): print "Multiple results for %s %s" % (kanji, kana)
Esempio n. 3
0
def addJLPTLevels(dbFile, jlptFile, level, outFile):
	global db, dbConnectionName

	outFile = codecs.open(outFile, "w", "utf-8")

	db = getDb(dbConnectionName)
	db.setDatabaseName(dbFile)
	if not db.open():
		usage()
		sys.exit(1)
	query = QSqlQuery(db)
	
	nbpos = 0
	nben = 0

	levelRegExp = re.compile('''.*<td class="(.+)"(?:>| />)([^<$\n]*).*''')
	file = open(jlptFile, "r")
	for line in file.xreadlines():
		match = levelRegExp.match(line)
		if match:
			if match.group(1) == "pos":
				kana = None
				kanji = None
				en = None
				nbpos += 1
			elif match.group(1) == "kanji":
				if kana == None:
					kana = QtCore.QString.fromUtf8(match.group(2))
					kana.remove(u"~")
					kana.remove(u"。")
					kana.remove(u"・")
					kana.remove(u"、")
					idx = kana.indexOf("/")
					if idx >= 0: kana.truncate(idx)
#					idx = kana.indexOf(u"・")
#					if idx >= 0: kana.truncate(idx)
					kana.remove(QtCore.QRegExp(u"(.*)"))
					kana.remove(QtCore.QRegExp(u"\(.*\)"))
					kana.remove(QtCore.QRegExp(u"する$"))
				else:
					kanji = QtCore.QString.fromUtf8(match.group(2))
					kanji.remove(u"~")
					idx = kanji.indexOf("/")
					if idx >= 0: kanji.truncate(idx)
			elif match.group(1) == "en":
				nben += 1
				en = QtCore.QString.fromUtf8(match.group(2))
				if kana != None:
					qString = "select kana.id from kanaText join kana on kana.docid = kanaText.docid join kanji on kana.id = kanji.id join kanjiText on kanji.docid = kanjiText.docid join entries on entries.id = kana.id where kanaText.reading match ? and kanjiText.reading match ? order by entries.frequency desc"
					query.prepare(qString)
					query.addBindValue(QtCore.QVariant(kana))
					query.addBindValue(QtCore.QVariant(kanji))
					if not query.exec_(): print query.lastError().text(); raise Exception
					# No result? Let's try by looking at the kana and definition then.
					if not query.next():
						query.prepare("select kana.id from kanaText join kana on kana.docid = kanaText.docid join gloss on gloss.id = kana.id join glossText on gloss.docid = glossText.docid join entries on entries.id = kana.id where kanaText.reading match ? and glossText.reading match ? order by entries.frequency desc")
						query.addBindValue(QtCore.QVariant(kana))
						query.addBindValue(QtCore.QVariant(en))
						if not query.exec_(): print query.lastError().text(); raise Exception
						if not query.next():
							# Still not? Then try kana and accept if there is only one entry corresponding
							query.prepare("select kana.id from kanaText join kana on kana.docid = kanaText.docid join entries on entries.id = kana.id where kanaText.reading match ? order by entries.frequency desc")
							query.addBindValue(QtCore.QVariant(kana))
							if not query.exec_(): print query.lastError().text(); raise Exception
							if not query.next():
								outFile.write("############%s %s\n" % (unicode(kanji), unicode(kana)))
								continue
							if query.next():
								outFile.write("############%s %s\n" % (unicode(kanji), unicode(kana)))
								continue
							query.first() 

					# Make sure not to violate primary key constraint
					#query2.prepare("insert or ignore into jlpt values (?, %d)" % (4))
					#query2.addBindValue(QtCore.QVariant(query.value(0)))
					#if not query2.exec_(): print query2.lastError().text(); raise Exception
					outFile.write("%d\n" % (query.value(0).toInt()[0]))
			else: print "Bad entry: %s" % (match.group(1))
	print nbpos, nben
Esempio n. 4
0
def addJLPTLevels(dbFile, jlptFile, level, outFile):
    global db, dbConnectionName

    outFile = codecs.open(outFile, "w", "utf-8")

    db = getDb(dbConnectionName)
    db.setDatabaseName(dbFile)
    if not db.open():
        usage()
        sys.exit(1)
    query = QSqlQuery(db)

    nbpos = 0
    nben = 0

    levelRegExp = re.compile('''.*<td class="(.+)"(?:>| />)([^<$\n]*).*''')
    file = open(jlptFile, "r")
    for line in file.xreadlines():
        match = levelRegExp.match(line)
        if match:
            if match.group(1) == "pos":
                kana = None
                kanji = None
                en = None
                nbpos += 1
            elif match.group(1) == "kanji":
                if kana == None:
                    kana = QtCore.QString.fromUtf8(match.group(2))
                    kana.remove(u"~")
                    kana.remove(u"。")
                    kana.remove(u"・")
                    kana.remove(u"、")
                    idx = kana.indexOf("/")
                    if idx >= 0: kana.truncate(idx)
                    #					idx = kana.indexOf(u"・")
                    #					if idx >= 0: kana.truncate(idx)
                    kana.remove(QtCore.QRegExp(u"(.*)"))
                    kana.remove(QtCore.QRegExp(u"\(.*\)"))
                    kana.remove(QtCore.QRegExp(u"する$"))
                else:
                    kanji = QtCore.QString.fromUtf8(match.group(2))
                    kanji.remove(u"~")
                    idx = kanji.indexOf("/")
                    if idx >= 0: kanji.truncate(idx)
            elif match.group(1) == "en":
                nben += 1
                en = QtCore.QString.fromUtf8(match.group(2))
                if kana != None:
                    qString = "select kana.id from kanaText join kana on kana.docid = kanaText.docid join kanji on kana.id = kanji.id join kanjiText on kanji.docid = kanjiText.docid join entries on entries.id = kana.id where kanaText.reading match ? and kanjiText.reading match ? order by entries.frequency desc"
                    query.prepare(qString)
                    query.addBindValue(QtCore.QVariant(kana))
                    query.addBindValue(QtCore.QVariant(kanji))
                    if not query.exec_():
                        print query.lastError().text()
                        raise Exception
                    # No result? Let's try by looking at the kana and definition then.
                    if not query.next():
                        query.prepare(
                            "select kana.id from kanaText join kana on kana.docid = kanaText.docid join gloss on gloss.id = kana.id join glossText on gloss.docid = glossText.docid join entries on entries.id = kana.id where kanaText.reading match ? and glossText.reading match ? order by entries.frequency desc"
                        )
                        query.addBindValue(QtCore.QVariant(kana))
                        query.addBindValue(QtCore.QVariant(en))
                        if not query.exec_():
                            print query.lastError().text()
                            raise Exception
                        if not query.next():
                            # Still not? Then try kana and accept if there is only one entry corresponding
                            query.prepare(
                                "select kana.id from kanaText join kana on kana.docid = kanaText.docid join entries on entries.id = kana.id where kanaText.reading match ? order by entries.frequency desc"
                            )
                            query.addBindValue(QtCore.QVariant(kana))
                            if not query.exec_():
                                print query.lastError().text()
                                raise Exception
                            if not query.next():
                                outFile.write("############%s %s\n" %
                                              (unicode(kanji), unicode(kana)))
                                continue
                            if query.next():
                                outFile.write("############%s %s\n" %
                                              (unicode(kanji), unicode(kana)))
                                continue
                            query.first()

                    # Make sure not to violate primary key constraint
                    #query2.prepare("insert or ignore into jlpt values (?, %d)" % (4))
                    #query2.addBindValue(QtCore.QVariant(query.value(0)))
                    #if not query2.exec_(): print query2.lastError().text(); raise Exception
                    outFile.write("%d\n" % (query.value(0).toInt()[0]))
            else:
                print "Bad entry: %s" % (match.group(1))
    print nbpos, nben