def _getKeyCharValue(self, event):
    """Derive the ``(key, char)`` pair for a win32 keyboard event.

    ``char`` is the last character ``ToUnicode`` produces for the event under
    the tracked keyboard state.  ``key`` is a lower-cased key label taken, in
    order of preference, from ``numpad_key_value_mappings``, from the
    character produced with SHIFT and NUM LOCK cleared, or from
    ``KeyboardConstants._getKeyName``; it is then optionally remapped through
    ``psychopy_key_mappings``.

    :param event: native keyboard event exposing ``KeyID``, ``ScanCode``
        and ``Key``.
    :return: ``(key, char)`` tuple.
    """
    state = self._keyboard_state
    buf = self._unichar

    def translate():
        # Translate the virtual key / scan code using the tracked key state;
        # the return value is the number of characters written to ``buf``.
        return self._user32.ToUnicode(event.KeyID,
                                      event.ScanCode,
                                      ctypes.byref(state),
                                      ctypes.byref(buf),
                                      8,
                                      0)

    # --- char value ---
    char = ''
    category = ''
    count = translate()
    if count > 0:
        produced = buf[count - 1]
        char = produced.encode('utf-8')
        category = ucategory(produced)

    # --- key value ---
    key = None
    if event.Key in numpad_key_value_mappings:
        key = numpad_key_value_mappings[event.Key]
    elif category.lower() != 'cc':
        # Translate again with SHIFT and NUM LOCK cleared, then put the
        # saved SHIFT / NUM LOCK / CAPS LOCK entries back.
        saved = (state[win32_vk.VK_SHIFT],
                 state[win32_vk.VK_NUM_LOCK],
                 state[win32_vk.VK_CAPITAL])
        state[win32_vk.VK_SHIFT] = 0
        state[win32_vk.VK_NUM_LOCK] = 0
        count = translate()
        (state[win32_vk.VK_SHIFT],
         state[win32_vk.VK_NUM_LOCK],
         state[win32_vk.VK_CAPITAL]) = saved
        if count > 0:
            key = buf[count - 1].encode('utf-8')

    if key is None:
        key = KeyboardConstants._getKeyName(event)

    # Normalise both values to text before comparing against key labels.
    if isinstance(key, bytes):
        key = str(key, 'utf-8')
    if isinstance(char, bytes):
        char = str(char, 'utf-8')
    key = key.lower()

    # misc. char value cleanup.
    # NOTE(review): the 'return' branch yields UTF-8 *bytes* while every
    # other path yields text -- confirm callers expect that.
    if key == 'return':
        char = '\n'.encode('utf-8')
    elif key in ('escape', 'backspace'):
        char = ''

    if Keyboard.use_psychopy_keymap and key in psychopy_key_mappings.keys():
        key = psychopy_key_mappings[key]

    # win32 specific handling of keypad / and - keys
    native_key = event.Key
    if native_key == 'Subtract':
        return 'num_subtract', char
    if native_key == 'Divide':
        return 'num_divide', char
    return key, char
def letters(limit=None):
    """Return a compact listing of every code point in Unicode category L*.

    Code points ``0 .. limit`` (inclusive) are scanned in order.  Each
    maximal run of consecutive letters is emitted as the single character
    (run of length one) or as ``first-last``; the pieces are concatenated
    without separators, e.g. ``"A-Za-z"`` for the ASCII range.  The format
    looks intended for a regex character class -- presumably; verify
    against the caller.

    :param limit: highest code point to examine.  Defaults to ``maxunicode``,
        which reproduces the original behaviour.
    :return: the concatenated ranges as a string.
    """
    if limit is None:
        limit = maxunicode

    def flush(first, last):
        # One run -> "x" for a single letter, "x-y" otherwise.
        pieces.append(first if first == last else first + "-" + last)

    pieces = []
    first = last = None
    for code in range(limit + 1):
        ch = chr(code)
        if ucategory(ch)[0] == 'L':
            if first is None:
                first = ch
            last = ch
        elif first is not None:
            flush(first, last)
            first = None

    # A run that reaches the end of the scanned range has no following
    # non-letter to trigger the flush above, so emit it here.
    if first is not None:
        flush(first, last)
    return ''.join(pieces)
def _getKeyNameForEvent(self, ns_event):
    """Resolve a key name for an ``NSEvent``-style keyboard event.

    The name is looked up, in order, from the event's characters, the
    characters ignoring modifiers (when the first lookup gave nothing or a
    Unicode 'C*' category character), ``KeyboardConstants._unicodeChars``,
    ``KeyboardConstants._virtualKeyCodes`` and finally
    ``KeyboardConstants._ansiKeyCodes``.

    :param ns_event: event exposing ``keyCode()``, ``characters()`` and
        ``charactersIgnoringModifiers()``.
    :return: ``(key_name, ucode, key_code)``; ``ucode`` is 0 when no
        character was available.
    """
    def without_prefix(name, prefix):
        # Drop a leading constant-table prefix such as 'VK_' if present.
        if name.startswith(prefix):
            return name[len(prefix):]
        return name

    key_code = ns_event.keyCode()
    ucode = 0

    # NOTE(review): ord() raises TypeError for a multi-character string --
    # confirm characters() can only ever return a single character here.
    key_name = ns_event.characters()
    if key_name:
        ucode = ord(key_name)

    # Nothing usable (or a control/other 'C*' character): retry without
    # the modifier keys applied.
    if ucode == 0 or ucategory(unichr(ucode))[0] == 'C':
        key_name = ns_event.charactersIgnoringModifiers()
        if key_name:
            ucode = ord(key_name)

    if ucode != 0:
        unicode_label = KeyboardConstants._unicodeChars.getName(ucode)
        if unicode_label:
            key_name = u'' + without_prefix(unicode_label, 'VK_')

    if not key_name:
        # Fall back to the key-code tables.
        key_name = KeyboardConstants._virtualKeyCodes.getName(key_code)
        if key_name:
            key_name = u'' + without_prefix(key_name, 'VK_')
        if not key_name:
            ansi_label = KeyboardConstants._ansiKeyCodes.getName(key_code)
            if ansi_label:
                key_name = u'' + without_prefix(ansi_label, 'ANSI_')

    return key_name, ucode, key_code
def _getKeyCharValue(self, event):
    """Return ``(key, char)`` for a win32 keyboard event.

    ``char`` is the UTF-8 encoded last character ``ToUnicode`` reports for
    the event under the tracked keyboard state ('' when none).  ``key`` is
    the numpad mapping for ``event.Key`` when one exists, else the UTF-8
    encoded character produced with SHIFT and NUM LOCK cleared, else the
    name from ``KeyboardConstants._getKeyName``.  ``key`` is lower-cased on
    return.

    :param event: native keyboard event exposing ``KeyID``, ``ScanCode``
        and ``Key``.
    :return: ``(key, char)`` tuple.
    """
    kb_state = self._keyboard_state
    out_chars = self._unichar
    to_unicode = self._user32.ToUnicode

    char = ''
    char_category = ''

    # Get char value
    n_chars = to_unicode(event.KeyID, event.ScanCode,
                         ctypes.byref(kb_state), ctypes.byref(out_chars),
                         8, 0)
    if n_chars > 0:
        last = out_chars[n_chars - 1]
        char = last.encode('utf-8')
        char_category = ucategory(last)

    # Get .key value
    key = None
    if event.Key in numpad_key_value_mappings:
        key = numpad_key_value_mappings[event.Key]
    elif char_category.lower() != 'cc':
        # Remember the three lock/shift entries, clear SHIFT and NUM LOCK,
        # translate again, then restore all three in the original order.
        remembered = [(vk, kb_state[vk]) for vk in (win32_vk.VK_SHIFT,
                                                    win32_vk.VK_NUM_LOCK,
                                                    win32_vk.VK_CAPITAL)]
        kb_state[win32_vk.VK_SHIFT] = 0
        kb_state[win32_vk.VK_NUM_LOCK] = 0
        n_chars = to_unicode(event.KeyID, event.ScanCode,
                             ctypes.byref(kb_state), ctypes.byref(out_chars),
                             8, 0)
        for vk, previous in remembered:
            kb_state[vk] = previous
        if n_chars > 0:
            key = out_chars[n_chars - 1].encode('utf-8')

    if key is None:
        key = KeyboardConstants._getKeyName(event)

    # misc. char value cleanup (compared before the key is lower-cased).
    if key == 'return':
        char = '\n'.encode('utf-8')
    elif key == 'escape' or key == 'backspace':
        char = ''

    return key.lower(), char
def _getKeyCharValue(self, event):
    """Compute the key label and character for a win32 keyboard event.

    Two ``ToUnicode`` translations may be performed: one with the tracked
    keyboard state (gives ``char``) and, unless a numpad mapping applies or
    the character is in Unicode category 'Cc', one with SHIFT and NUM LOCK
    cleared (gives ``key``).  If neither yields a key,
    ``KeyboardConstants._getKeyName`` supplies it.

    :param event: native keyboard event exposing ``KeyID``, ``ScanCode``
        and ``Key``.
    :return: ``(key.lower(), char)``.
    """
    def last_translated_char():
        # Last character written by ToUnicode, or None if nothing was
        # produced (return value <= 0).
        written = self._user32.ToUnicode(event.KeyID,
                                         event.ScanCode,
                                         ctypes.byref(self._keyboard_state),
                                         ctypes.byref(self._unichar),
                                         8,
                                         0)
        if written > 0:
            return self._unichar[written - 1]
        return None

    # Get char value
    char, ucat = '', ''
    typed = last_translated_char()
    if typed is not None:
        char, ucat = typed.encode('utf-8'), ucategory(typed)

    # Get .key value
    key = None
    if event.Key in numpad_key_value_mappings:
        key = numpad_key_value_mappings[event.Key]
    elif ucat.lower() != 'cc':
        shift_was = self._keyboard_state[win32_vk.VK_SHIFT]
        numlock_was = self._keyboard_state[win32_vk.VK_NUM_LOCK]
        caps_was = self._keyboard_state[win32_vk.VK_CAPITAL]

        self._keyboard_state[win32_vk.VK_SHIFT] = 0
        self._keyboard_state[win32_vk.VK_NUM_LOCK] = 0
        unshifted = last_translated_char()

        # Put the tracked state back before using the result.
        self._keyboard_state[win32_vk.VK_SHIFT] = shift_was
        self._keyboard_state[win32_vk.VK_NUM_LOCK] = numlock_was
        self._keyboard_state[win32_vk.VK_CAPITAL] = caps_was

        if unshifted is not None:
            key = unshifted.encode('utf-8')

    if key is None:
        key = KeyboardConstants._getKeyName(event)

    # misc. char value cleanup.
    if key in ('escape', 'backspace'):
        char = ''
    elif key == 'return':
        char = '\n'.encode('utf-8')

    return key.lower(), char
def _getKeyNameForEvent(self,ns_event):
    """Work out the key name, unicode ordinal and key code of a key event.

    ``characters()`` is tried first; when it gives no character, or one in
    a Unicode 'C*' category, ``charactersIgnoringModifiers()`` is used
    instead.  A non-zero ordinal is then mapped through
    ``KeyboardConstants._unicodeChars``; if no name results, the key code is
    mapped through ``_virtualKeyCodes`` and then ``_ansiKeyCodes``.  The
    'VK_' / 'ANSI_' prefixes are stripped from table names.

    :param ns_event: event exposing ``keyCode()``, ``characters()`` and
        ``charactersIgnoringModifiers()``.
    :return: ``(key_name, ucode, key_code)``.
    """
    key_code = ns_event.keyCode()

    # NOTE(review): ord() below assumes a single-character string; a longer
    # string raises TypeError -- confirm against the event source.
    key_name = ns_event.characters()
    has_chars = key_name and len(key_name) > 0
    ucode = ord(key_name) if has_chars else 0

    modifiers_hid_key = ucode == 0 or ucategory(unichr(ucode))[0] == 'C'
    if modifiers_hid_key:
        key_name = ns_event.charactersIgnoringModifiers()
        if key_name and len(key_name) > 0:
            ucode = ord(key_name)

    if ucode != 0:
        by_unicode = KeyboardConstants._unicodeChars.getName(ucode)
        if by_unicode and len(by_unicode) > 0:
            if by_unicode.startswith('VK_'):
                by_unicode = by_unicode[3:]
            key_name = u'' + by_unicode

    if key_name is not None and len(key_name) != 0:
        return key_name, ucode, key_code

    # No name yet: consult the key-code lookup tables.
    key_name = KeyboardConstants._virtualKeyCodes.getName(key_code)
    if key_name:
        trimmed = key_name[3:] if key_name.startswith('VK_') else key_name
        key_name = (u'' + trimmed)
    if not key_name:
        by_ansi = KeyboardConstants._ansiKeyCodes.getName(key_code)
        if by_ansi and len(by_ansi) > 0:
            trimmed = by_ansi[5:] if by_ansi.startswith('ANSI_') else by_ansi
            key_name = u'' + trimmed

    return key_name, ucode, key_code
def OnChar(self, event):
    """ Intercept a char event and let it through only if the text that
    would result is not INVALID according to the validator.

    Characters in a Unicode 'C*' category (control keys that still generate
    char events, such as the arrow keys) cannot change the displayed text
    and are always passed on.

    """
    typed = unichr(event.GetUnicodeKey())

    if ucategory(typed).startswith('C'):
        # Nothing displayable: no validation needed.
        passes = True
    else:
        # Build the text as it would look with the character inserted at
        # the insertion point. INTERMEDIATE and ACCEPTABLE results pass;
        # INVALID kills the event so the control is not visibly updated.
        validator = self.validator
        control = self.GetWindow()
        text = control.GetValue()
        caret = control.GetInsertionPoint()
        candidate = text[:caret] + typed + text[caret:]
        passes = validator.validate(candidate) != validator.INVALID

    if passes:
        event.Skip()
def _getIOHubEventObject(self,native_event_data):
    """
    Convert native win32 keyboard event data into an ioHub keyboard event list.

    native_event_data is either a sequence with more than two elements,
    which is returned unchanged (per the inline comment it is already a
    KeyboardCharEvent), or a (notifiedTime, event) pair. For the pair, the
    tracked modifier / keyboard state is updated, event.Modifiers is set,
    and the key is translated with ToUnicode.

    Returns the event as a list of field values. If any exception is raised
    the details are printed with printExceptionDetailsToStdErr() and None is
    returned implicitly.
    """
    try:
        if len(native_event_data) >2:
            # it is a KeyboardCharEvent
            return native_event_data

        notifiedTime, event=native_event_data
        etype = event.Type

        #
        # Start Tracking Modifiers that are pressed
        #
        # NOTE(review): the nesting in this section was reconstructed from a
        # whitespace-collapsed source -- confirm against the original file.
        keyID=event.KeyID
        modKeyName=Keyboard._win32_modifier_mapping.get(keyID,None)
        if modKeyName:
            mod_value=KeyboardConstants._modifierCodes.getID(modKeyName)
            if keyID == win32_vk.VK_CAPITAL and etype==EventConstants.KEYBOARD_PRESS:
                # Caps lock toggles on each press: state 0x01 <-> 0.
                if self._keyboard_state[keyID] > 0:
                    self._keyboard_state[keyID]=0
                    ioHubKeyboardDevice._modifier_value-=mod_value
                else:
                    self._keyboard_state[keyID] = 0x01
                    ioHubKeyboardDevice._modifier_value+=mod_value
            elif etype==EventConstants.KEYBOARD_PRESS and self._keyboard_state[keyID]==0:
                # First press of a modifier that is not already marked down
                # (auto-repeat presses find a non-zero state and are ignored).
                self._keyboard_state[keyID] = 0x80
                ioHubKeyboardDevice._modifier_value+=mod_value

                # Also mark the generic (side-independent) virtual key as down.
                if keyID in [win32_vk.VK_LSHIFT,win32_vk.VK_RSHIFT] and self._keyboard_state[win32_vk.VK_SHIFT]==0:
                    self._keyboard_state[win32_vk.VK_SHIFT] = 0x80
                elif keyID in [win32_vk.VK_LCONTROL,win32_vk.VK_RCONTROL] and self._keyboard_state[win32_vk.VK_CONTROL] ==0:
                    self._keyboard_state[win32_vk.VK_CONTROL] = 0x80
                elif keyID in [win32_vk.VK_LMENU,win32_vk.VK_RMENU] and self._keyboard_state[win32_vk.VK_MENU]==0:
                    self._keyboard_state[win32_vk.VK_MENU] = 0x80
            elif etype==EventConstants.KEYBOARD_RELEASE and keyID != win32_vk.VK_CAPITAL:
                if self._keyboard_state[keyID]!=0 and keyID != win32_vk.VK_CAPITAL:
                    ioHubKeyboardDevice._modifier_value-=mod_value
                self._keyboard_state[keyID] = 0

                # Clear the generic virtual key only once both the left and
                # right variants are up.
                if modKeyName.find('SHIFT')>=0 and self._keyboard_state[win32_vk.VK_LSHIFT]==0 and self._keyboard_state[win32_vk.VK_RSHIFT]==0:
                    self._keyboard_state[win32_vk.VK_SHIFT] = 0
                elif modKeyName.find('CONTROL')>=0 and self._keyboard_state[win32_vk.VK_LCONTROL]==0 and self._keyboard_state[win32_vk.VK_RCONTROL]==0:
                    self._keyboard_state[win32_vk.VK_CONTROL] = 0
                elif modKeyName.find('ALT')>=0 and self._keyboard_state[win32_vk.VK_LMENU]==0 and self._keyboard_state[win32_vk.VK_RMENU]==0:
                    self._keyboard_state[win32_vk.VK_MENU] = 0
        #
        # End Tracking Modifiers that are pressed
        #
        # NOTE(review): this None check runs after the += / -= above, so a
        # None value would already have raised for a modifier key event.
        if ioHubKeyboardDevice._modifier_value is None:
            ioHubKeyboardDevice._modifier_value=0
        event.Modifiers=ioHubKeyboardDevice._modifier_value

        # From MSDN: http://msdn.microsoft.com/en-us/library/windows/desktop/ms644939(v=vs.85).aspx
        # The time is a long integer that specifies the elapsed time, in
        # milliseconds, from the time the system was started to the time the
        # message was created (that is, placed in the thread's message queue).
        # REMARKS: The return value from the GetMessageTime function does not
        # necessarily increase between subsequent messages, because the value
        # wraps to zero if the timer count exceeds the maximum value for a
        # long integer. To calculate time delays between messages, verify
        # that the time of the second message is greater than the time of the
        # first message; then, subtract the time of the first message from
        # the time of the second message.
        device_time = event.Time/1000.0 # convert to sec
        time = notifiedTime #TODO correct kb times to factor in delay if possible.

        # since this is a keyboard device using a callback method,
        # confidence_interval is not applicable
        confidence_interval=0.0
        # since this is a keyboard, we 'know' there is a delay, but until we
        # support setting a delay in the device properties based on external
        # testing for a given keyboard, we will leave at 0.
        delay=0.0

        #
        ## check for unicode char
        #

        # uchar holds the unicode ord() number for the unicode char.
        # unichr(uchar) == u'x' unicode str
        uchar=0

        # key holds the unicode char, in 8 bit string format encoded in UTF-8.
        # Safe to transmit and to store in pytables. Use key.decode('utf-8')
        # to get the unicode symbol decoded into a unicode string.
        key=None

        # ucat holds the unicode character category. This can be used to tell
        # what kind of code point it is.
        # For a list of categories see http://www.unicode.org/reports/tr44/#General_Category_Values
        # examples:
        #   u'v' -> category: Ll
        #   u'<-' (ESCAPE) -> category: Cc
        #   u' ' -> category: Zs
        ucat=None

        result=self._user32.ToUnicode(event.KeyID, event.ScanCode,ctypes.byref(self._keyboard_state),ctypes.byref(self._unichar),8,0)

        if result > 0:
            if result == 1:
                key=self._unichar[0].encode('utf-8')
                uchar=ord(self._unichar[0])
                ucat=ucategory(self._unichar[0])
            else:
                # Several characters were produced: key is all of them,
                # while uchar / ucat end up describing the last one.
                key=u''
                for c in range(result):
                    uchar=ord(self._unichar[c])
                    ucat=ucategory(self._unichar[c])
                key=self._unichar[0:result]
                key=key.encode('utf-8')
        elif result == -1:
            # The specified virtual key is a dead-key character (accent or
            # diacritic). This value is returned regardless of the keyboard
            # layout, even if several characters have been typed and are
            # stored in the keyboard state. If possible, even with Unicode
            # keyboard layouts, the function has written a spacing version of
            # the dead-key character to the buffer specified by pwszBuff. For
            # example, the function writes the character SPACING ACUTE
            # (0x00B4), rather than the character NON_SPACING ACUTE (0x0301).
            key=self._unichar[0].encode('utf-8')
            uchar=ord(self._unichar[0])
            ucat=ucategory(self._unichar[0])

        if result==0 or ucat and ucat[0]=='C':
            # No character was produced, or it is in a Unicode 'C*' (control /
            # other) category: look in the built in key mappings; if a key
            # label is returned, use it instead of the unicode char (it could
            # be a non visible key). uchar is left as is.
            #
            # TODO: it seems that the ToUnicode function is not factoring in
            # whether shift is pressed or not, so it always returns a lower
            # case letter. We 'think' upper case unicode chars should be
            # returned if shift is pressed, so this needs to be looked into.
            lookupkey,_=KeyboardConstants._getKeyNameAndModsForEvent(event)
            if lookupkey and len(lookupkey)>0:
                key=lookupkey

        return [0,
                0,
                0, #device id (not currently used)
                Computer._getNextEventID(),
                etype,
                device_time,
                notifiedTime,
                time,
                confidence_interval,
                delay,
                0,
                event.RepeatCount,
                event.ScanCode,
                event.KeyID,
                uchar,
                key,
                event.Modifiers,
                event.Window
                ]
    except:
        printExceptionDetailsToStdErr()
def _getIOHubEventObject(self, native_event_data):
    """
    Convert native win32 keyboard event data into an ioHub keyboard event list.

    native_event_data is unpacked as a (notifiedTime, event) pair. The
    tracked modifier / keyboard state is updated, event.Modifiers is set,
    a ``key`` label is derived from event.Key, and a ``char`` value is
    obtained with ToUnicode. The resulting list is passed to
    ioHubKeyboardDevice._updateKeyboardEventState() and then returned.

    If any exception is raised the details are printed with
    printExceptionDetailsToStdErr() and None is returned implicitly.
    """
    try:
        notifiedTime, event = native_event_data
        etype = event.Type

        #
        # Start Tracking Modifiers that are pressed
        #
        # NOTE(review): the nesting in this section was reconstructed from a
        # whitespace-collapsed source -- confirm against the original file.
        keyID = event.KeyID
        modKeyName = Keyboard._win32_modifier_mapping.get(keyID, None)
        if modKeyName:
            mod_value = KeyboardConstants._modifierCodes.getID(modKeyName)
            if keyID == win32_vk.VK_CAPITAL and etype == EventConstants.KEYBOARD_PRESS:
                # Caps lock toggles on each press: state 0x01 <-> 0.
                if self._keyboard_state[keyID] > 0:
                    self._keyboard_state[keyID] = 0
                    ioHubKeyboardDevice._modifier_value -= mod_value
                else:
                    self._keyboard_state[keyID] = 0x01
                    ioHubKeyboardDevice._modifier_value += mod_value
            elif etype == EventConstants.KEYBOARD_PRESS and \
                    self._keyboard_state[keyID] == 0:
                # First press of a modifier that is not already marked down
                # (auto-repeat presses find a non-zero state and are ignored).
                self._keyboard_state[keyID] = 0x80
                ioHubKeyboardDevice._modifier_value += mod_value

                # Also mark the generic (side-independent) virtual key as down.
                if keyID in [win32_vk.VK_LSHIFT, win32_vk.VK_RSHIFT] and \
                        self._keyboard_state[win32_vk.VK_SHIFT] == 0:
                    self._keyboard_state[win32_vk.VK_SHIFT] = 0x80
                elif keyID in [win32_vk.VK_LCONTROL, win32_vk.VK_RCONTROL] and \
                        self._keyboard_state[win32_vk.VK_CONTROL] == 0:
                    self._keyboard_state[win32_vk.VK_CONTROL] = 0x80
                elif keyID in [win32_vk.VK_LMENU, win32_vk.VK_RMENU] and \
                        self._keyboard_state[win32_vk.VK_MENU] == 0:
                    self._keyboard_state[win32_vk.VK_MENU] = 0x80
            elif etype == EventConstants.KEYBOARD_RELEASE and keyID != win32_vk.VK_CAPITAL:
                if self._keyboard_state[keyID] != 0 and keyID != win32_vk.VK_CAPITAL:
                    ioHubKeyboardDevice._modifier_value -= mod_value
                self._keyboard_state[keyID] = 0

                # Clear the generic virtual key only once both the left and
                # right variants are up.
                if modKeyName.find('SHIFT') >= 0 and \
                        self._keyboard_state[win32_vk.VK_LSHIFT] == 0 and \
                        self._keyboard_state[win32_vk.VK_RSHIFT] == 0:
                    self._keyboard_state[win32_vk.VK_SHIFT] = 0
                elif modKeyName.find('CONTROL') >= 0 and \
                        self._keyboard_state[win32_vk.VK_LCONTROL] == 0 and \
                        self._keyboard_state[win32_vk.VK_RCONTROL] == 0:
                    self._keyboard_state[win32_vk.VK_CONTROL] = 0
                elif modKeyName.find('ALT') >= 0 and \
                        self._keyboard_state[win32_vk.VK_LMENU] == 0 and \
                        self._keyboard_state[win32_vk.VK_RMENU] == 0:
                    self._keyboard_state[win32_vk.VK_MENU] = 0
        #
        # End Tracking Modifiers that are pressed
        #
        # NOTE(review): this None check runs after the += / -= above, so a
        # None value would already have raised for a modifier key event.
        if ioHubKeyboardDevice._modifier_value is None:
            ioHubKeyboardDevice._modifier_value = 0
        event.Modifiers = ioHubKeyboardDevice._modifier_value

        # From MSDN:
        # http://msdn.microsoft.com/en-us/library/windows/desktop/ms644939(v=vs.85).aspx
        # The time is a long integer that specifies the elapsed time,
        # in milliseconds, from the time the system was started to the
        # time the message was created (that is, placed in the thread's
        # message queue). REMARKS: The return value from the GetMessageTime
        # function does not necessarily increase between subsequent
        # messages, because the value wraps to zero if the timer count
        # exceeds the maximum value for a long integer.
        # To calculate time delays between messages, verify that the time
        # of the second message is greater than the time of the first
        # message; then, subtract the time of the first message from the
        # time of the second message.
        device_time = event.Time / 1000.0  # convert to sec
        time = notifiedTime

        # since this is a keyboard device using a callback method,
        # confidence_interval is not applicable
        confidence_interval = 0.0

        # since this is a keyboard, we 'know' there is a delay, but until
        # we support setting a delay in the device properties based on
        # external testing for a given keyboard, we will leave at 0.
        delay = 0.0

        #
        ## check for unicode char
        #

        # uchar holds the unicode ord() number for the unicode char.
        # unichr(uchar) == u'x' unicode str
        uchar = 0

        # the intent of key is to provide the same key mapping as
        # the psychopy event.getkeys that uses pyglet returns.
        key = event.Key
        try:
            key = key.lower()
        except:
            # Best effort: leave key as-is if it has no usable lower().
            pass

        # If pyHook returns an oem_xxx key value, use our lookup tables
        # to find an alternative.
        if key[:3] == 'oem':
            key = KeyboardConstants._getKeyNameAndModsForEvent(event)[0].lower()

        # Do some adjustments so key aligns with psychopy key constants
        if key in self._psychopy_key_mismatches:
            key = self._psychopy_key_mismatches[key]
        elif key.startswith('numpad'):
            # 'numpadX' -> 'num_X'
            key = 'num_%s'%(key[6:])

        # char holds the unicode char, in 8 bit string format encoded
        # in UTF-8. Safe to transmit and to store in pytables.
        # Use char.decode('utf-8') to get the unicode symbol decoded
        # into a unicode string.
        char = None

        # ucat holds the unicode character category. This can be used to
        # tell what kind of code point it is. For a list of categories see
        # http://www.unicode.org/reports/tr44/#General_Category_Values
        # examples:
        #   u'v' -> category: Ll
        #   u'<-' (ESCAPE) -> category: Cc
        #   u' ' -> category: Zs
        ucat = None

        result = self._user32.ToUnicode(event.KeyID, event.ScanCode,
                                        ctypes.byref(self._keyboard_state),
                                        ctypes.byref(self._unichar), 8, 0)

        if result > 0:
            if result == 1:
                char = self._unichar[0].encode('utf-8')
                uchar = ord(self._unichar[0])
                ucat = ucategory(self._unichar[0])
            else:
                # Several characters were produced: char is all of them,
                # while uchar / ucat end up describing the last one.
                for c in range(result):
                    uchar = ord(self._unichar[c])
                    ucat = ucategory(self._unichar[c])
                char = self._unichar[0:result]
                char = char.encode('utf-8')
        elif result == -1:
            # The specified virtual char is a dead-key character
            # (accent or diacritic). This value is returned regardless of
            # the keyboard layout, even if several characters have been
            # typed and are stored in the keyboard state. If possible,
            # even with Unicode keyboard layouts, the function has written
            # a spacing version of the dead-key character to the buffer
            # specified by pwszBuff. For example, the function writes the
            # character SPACING ACUTE (0x00B4), rather than the character
            # NON_SPACING ACUTE (0x0301).
            char = self._unichar[0].encode('utf-8')
            uchar = ord(self._unichar[0])
            ucat = ucategory(self._unichar[0])

        # When no character was produced, or it is in a Unicode 'C*'
        # category, use the lookup-table key label as char instead.
        lukey, _ = KeyboardConstants._getKeyNameAndModsForEvent(event)
        if result == 0 or ucat and ucat[0] == 'C':
            if lukey and len(lukey) > 0:
                char = lukey

        kb_event = [0,
                    0,
                    0,  #device id (not currently used)
                    Computer._getNextEventID(),
                    etype,
                    device_time,
                    notifiedTime,
                    time,
                    confidence_interval,
                    delay,
                    0,
                    event.RepeatCount,
                    event.ScanCode,
                    event.KeyID,
                    uchar,
                    key,
                    event.Modifiers,
                    event.Window,
                    char,  # .char
                    0.0,  # duration
                    0  # press_event_id
                    ]

        ioHubKeyboardDevice._updateKeyboardEventState(self, kb_event,
                                                      is_press=(
                                                      etype == EventConstants.KEYBOARD_PRESS))
        return kb_event
    except:
        printExceptionDetailsToStdErr()
# installer) using your package manager, then, with pip, install its ... # # DEPENDENCIES: nltk. # # ============================================================================================= import sys, os, re from unicodedata import category as ucategory, normalize from nltk.tokenize import word_tokenize as tokenize ucats = { r'Cc', r'Cf', r'Co', r'Cs', r'Pe', r'Po', r'Ps', r'Nd', r'Sc', r'Sm', r'So', r'Zl', r'Zp' } junkToSpace = dict.fromkeys(i for i in range(sys.maxunicode) if (ucategory(chr(i)) in ucats)) junkToSpace = {cat: r' ' for cat in junkToSpace} argc = len(sys.argv) if ((argc > 4) or (argc < 3) or ((not os.path.isfile(sys.argv[-1]) and (sys.argv[-1] != r'-')))): sys.stderr.write("Usage: '" + sys.argv[0] + "' [N] HOW_MANY TEXT_FILE|-\n") exit(1) elif (sys.argv[-1] == r'-'): bulk = sys.stdin.buffer.read().decode(errors='ignore') else: bulk = open(sys.argv[-1], 'rb').read().decode(errors='ignore') bulk = re.sub(r'\s+', r' ', normalize('NFC', bulk.translate(junkToSpace)).casefold()) bulk = [re.sub(r'\s+', '_', (' ' + t + ' ')) for t in tokenize(bulk.strip())] try:
def isGoodToken(token):
    """Return *token* if it begins or ends with a letter, otherwise False.

    A token is rejected only when neither its first nor its last character
    is in a Unicode letter category (``L*``).  An empty token has no such
    character and is rejected instead of raising ``IndexError``.

    :param token: the token string to check.
    :return: ``token`` itself when accepted, ``False`` when rejected.
    """
    # Disabled rule, kept for reference: reject tokens whose first character
    # is title/upper case (Lt, Lu) unless lang == 'de'.
    if not token:
        return False
    starts_with_letter = ucategory(token[0])[0] == r'L'
    if not starts_with_letter and ucategory(token[-1])[0] != r'L':
        return False
    return token
def isJunk(c):
    """Return True if code point *c* counts as junk.

    A code point is junk when the Unicode category of its character is in
    ``junk`` or the character itself is in ``_s``.
    """
    if ucategory(chr(c)) in junk:
        return True
    return chr(c) in _s