Example #1
0
 def fixEncodingError(self, text):
     """Replace a fixed set of characters in ``text`` and return the result.

     Every character of ``fromChars`` is replaced by the character at the
     same position in ``toChars``; all other characters are kept as they
     are.

     NOTE(review): earlier revisions carried an EUC-JP encodability check
     after the final ``return``. It was unreachable (and referenced an
     undefined name), so it never ran and has been removed. This method
     does not verify that the result can be encoded.

     :param text: the text to clean up.
     :return: ``text`` with the substitutions applied.
     """
     # try to fix
     # -  -> ー
     # ~  ->  ~
     fromChars = '-~〝〟'
     toChars = 'ー~""'
     if isPy2():
         # The table maps code point -> replacement code point.
         table = self.maketransU(fromChars, toChars)
         pieces = []
         for ch in text:
             code = table.get(ord(ch), None)
             pieces.append(ch if code is None else unichr(code))
         # Join on an empty slice of the input so the result has the same
         # string type the piecewise concatenation would have produced.
         return text[:0].join(pieces)
     table = text_type.maketrans(fromChars, toChars)
     return text.translate(table)
Example #2
0
def getPartOfSpeech():
    """Run ``MecabRunner`` over a sample sentence and print the result.

    The runner is built with the format string ``'%m,%f[7]'``. Every item
    of the result is consumed; on Python 3 each item is printed with its
    elements concatenated, on Python 2 nothing is printed.
    """
    # Alternative sample input: '海泡石'
    sample = '雨が降っていたん'
    mecab = MecabRunner('%m,%f[7]')
    for fields in mecab.run(sample):
        if isPy2():
            continue
        print(''.join(fields))
Example #3
0
def dumpNodeInfo():
    """Run ``MecabOutputGetter`` over a sample sentence and print the result.

    Every item of the result is consumed; on Python 3 each item is printed
    with its elements separated by single spaces, on Python 2 nothing is
    printed.
    """
    # Alternative sample input: 'すべてに滲《し》み込み'
    sample = '雨が降っていたん'
    getter = MecabOutputGetter()
    for fields in getter.run(sample):
        if isPy2():
            continue
        print(' '.join(fields))
Example #4
0
 def getReadingAndDefinition(self, word):
     """Look up ``word`` in the ``dict`` table.

     The query matches rows whose ``kanji`` column equals ``word`` and reads
     only the first row returned. On Python 2 the word is encoded to UTF-8
     before it is bound to the query.

     :param word: the value compared against the ``kanji`` column.
     :return: a ``(kana, entry)`` pair taken from the first matching row,
         or ``(None, None)`` when no row matches.
     """
     cursor = self.__conn.cursor()
     key = word.encode('utf-8') if isPy2() else word
     cursor.execute("select kana, entry from dict where kanji=:what order by kanji", {"what": key})
     row = cursor.fetchone()
     if not row:
         return None, None
     return row[0], row[1]
Example #5
0
def getUniqueCSVList(textProc, contents, deckFileName, tag):
    """Print one quoted, semicolon-separated record per new word in ``contents``.

    Each record has five fields: word, reading, definition, sentence, tag.
    A word is skipped when it was already printed, when it has no
    definition, or when a deck was given and already contains the word.

    Every field is wrapped in double quotes, and any double quote inside a
    field is doubled so that the record stays parseable.

    :param textProc: object whose ``do(contents, settings, flag)`` yields
        ``(word, startPos, reading, definition, sentence)`` tuples.
    :param contents: the text handed to ``textProc.do``.
    :param deckFileName: file name passed to ``DeckWords``; falsy to skip
        the deck check.
    :param tag: value of the last field; ``None`` is written as an empty
        string.
    """
    deck = DeckWords(deckFileName) if deckFileName else None
    if tag is None:
        tag = ''

    def quoted(value):
        # Render the value the way str.format would, then escape embedded
        # quotes by doubling them before wrapping the field in quotes.
        rendered = text_type('{0:}').format(value)
        return text_type('"{0}"').format(rendered.replace('"', '""'))

    seen = set()
    for word, _startPos, reading, definition, sentence in textProc.do(contents, Settings.NoExcessiveReading(), True):
        if word in seen or not definition or (deck and deck.isInDeck(word)):
            continue
        seen.add(word)
        fields = (word, reading, definition, sentence, tag)
        line = text_type(';').join(quoted(field) for field in fields)
        if isPy2():
            # Python 2 stdout expects bytes.
            print(line.encode('utf-8'))
        else:
            print(line)
Example #6
0
def main():
    """Command-line entry point: print the unique word list of an input file.

    Parses the command line, optionally redirects ``sys.stdout`` to the file
    given with ``-o``, reads the input file and passes its contents to
    ``getUniqueCSVList``. When output was redirected, the output file is
    closed and ``sys.stdout`` is restored before returning, even on error.
    """
    parser = argparse.ArgumentParser(description='Get the list word in the text.')
    parser.add_argument('inputfile', metavar='input file name',
                   help='input file name')
    parser.add_argument('-d', metavar='deck file name', required=False,
                   help='deck file name')
    parser.add_argument('-t', metavar='tag', required=False,
                   help='optional tag appended to the list')
    parser.add_argument('-o', metavar='output file name', required=False,
                   help='output file name')
    args = parser.parse_args()

    previousStdout = sys.stdout
    outputFile = None
    if args.o:
        # The Python 2 builtin open() has no ``encoding`` argument; there the
        # records are encoded to UTF-8 bytes before being printed.
        if isPy2():
            outputFile = open(args.o, 'w')
        else:
            outputFile = open(args.o, 'w', encoding='utf-8')
        sys.stdout = outputFile

    try:
        setupLogger()
        with openInputFile(args.inputfile) as inputFile:
            contents = inputFile.read()
        if isPy2():
            contents = unicode(contents, 'utf-8')
        textProc = TextProcessor(getDataLoader())
        getUniqueCSVList(textProc, contents, args.d, args.t)
    finally:
        if outputFile is not None:
            sys.stdout = previousStdout
            outputFile.close()
Example #7
0
def openOutputFile(fileName):
    """Open ``fileName`` for writing in text mode and return the file object.

    On Python 3 the file is opened with UTF-8 encoding; on Python 2 it is
    opened with the builtin ``open`` and no encoding argument.
    """
    if not isPy2():
        return open(fileName, 'w', encoding='utf-8')
    return open(fileName, 'w')
Example #8
0
def openInputFile(fileName):
    """Open ``fileName`` for reading in text mode and return the file object.

    On Python 3 the file is opened with UTF-8 encoding; on Python 2 it is
    opened with the builtin ``open`` and no encoding argument.
    """
    if not isPy2():
        return open(fileName, 'r', encoding='utf-8')
    return open(fileName, 'r')