def read_files_count(filepath):
    '''Moves through a .tess file and counts each token it encounters.

    Parameters
    ----------
    filepath: a file in .tess format
    '''
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    stop = 0
    while stop != 1:
        try:
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken)
            token = cleantoken_list[0]
            countgram(token)
        except StopIteration:
            stop = 1
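
# 'token_cleanup' and 'countgram' are defined elsewhere in this codebase.
# The sketches below show only what read_files_count assumes about them:
# token_cleanup normalizes a raw token and splits off trailing punctuation
# (so index 0 is always the bare word), and countgram tallies frequencies.
# Names and details here are illustrative, not the project's actual code.
from collections import defaultdict
from string import punctuation

TOKEN_COUNTS = defaultdict(int)  # hypothetical stand-in for the global count table

def token_cleanup_sketch(rawtoken):
    rawtoken = rawtoken.lower()
    if rawtoken and rawtoken[-1] in punctuation:
        # Keep the punctuation mark as a separate list element.
        return [rawtoken[:-1], rawtoken[-1]]
    return [rawtoken]

def countgram_sketch(token):
    TOKEN_COUNTS[token] += 1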
def read_files(filepath):
    '''Moves through a .tess file and calls the 'next' and 'count_lemma'
    functions as needed. Updates the COUNT_LIBRARY global object.

    Parameters
    ----------
    filepath: a file in .tess format
    '''
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    stop = 0
    while stop != 1:
        try:
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken)
            count_lemma(cleantoken_list[0])
        except StopIteration:
            stop = 1
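
# 'count_lemma' is assumed to tally every lemma that could account for a
# token, so ambiguous forms contribute counts to all of their candidate
# lemmas. A minimal sketch under that assumption; 'possible_lemmas' is a
# hypothetical dictionary-lookup helper, not part of this codebase.
from collections import defaultdict

COUNT_LIBRARY = defaultdict(int)

def count_lemma_sketch(token):
    for lemma in possible_lemmas(token):  # hypothetical lemma lookup
        COUNT_LIBRARY[lemma] += 1

# Example driver: run read_files over every .tess file in a corpus
# directory (the path is illustrative).
import os
corpus_dir = os.path.expanduser('~/corpus')
for name in os.listdir(corpus_dir):
    if name.endswith('.tess'):
        read_files(os.path.join(corpus_dir, name))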
def test_read_tokens(self, tessfile_list):
    for f in tessfile_list:
        lines = []
        with open(f, 'r') as tess:
            for line in tess.readlines():
                lines.append(line)

        t_b = TessFile(f)
        t_r = TessFile(f, buffer=False)

        # Ensure that tokens omit the tag when requested
        # Grab all tokens from the text
        tokens = []
        for line in lines:
            start = line.find('>')
            if start >= 0:
                tokens.extend(
                    line[start + 1:].strip(string.whitespace).split())

        # Test with buffer
        for i, token in enumerate(t_b.read_tokens()):
            assert token == tokens[i]

        # Ensure that the iterator resets
        reset = False
        for i, token in enumerate(t_b.read_tokens()):
            assert token == tokens[i]
            reset = True
        assert reset

        # Test with initial read
        for i, token in enumerate(t_r.read_tokens()):
            assert token == tokens[i]

        # Ensure that the iterator resets
        reset = False
        for i, token in enumerate(t_r.read_tokens()):
            assert token == tokens[i]
            reset = True
        assert reset

        # Ensure that tokens include the tag when requested
        # Lines now start before the tag
        tokens = []
        for line in lines:
            tokens.extend(line.strip().split())

        # Test with buffer
        for i, token in enumerate(t_b.read_tokens(include_tag=True)):
            assert token == tokens[i]

        # Ensure that the iterator resets
        reset = False
        for i, token in enumerate(t_b.read_tokens(include_tag=True)):
            assert token == tokens[i]
            reset = True
        assert reset

        # Test with initial read
        for i, token in enumerate(t_r.read_tokens(include_tag=True)):
            assert token == tokens[i]

        # Ensure that the iterator resets
        reset = False
        for i, token in enumerate(t_r.read_tokens(include_tag=True)):
            assert token == tokens[i]
            reset = True
        assert reset
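
# The test above assumes a 'tessfile_list' fixture that supplies paths to
# sample .tess files, plus 'import string' at module scope. A minimal sketch
# of such a fixture; the data directory is a hypothetical location, not
# necessarily the project's actual layout.
import glob
import os
import pytest

@pytest.fixture(scope='module')
def tessfile_list():
    data_dir = os.path.join(os.path.dirname(__file__), 'data')  # hypothetical
    return glob.glob(os.path.join(data_dir, '*.tess'))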
# NB: the enclosing function header was lost from this excerpt; the name and
# the if-condition below are inferred from the single-lemma branch.
def lemma_probabilities(lemmas):
    if len(lemmas) > 1:
        # Weight each candidate lemma by its share of the total corpus counts.
        all_lemmas_total = sum([COUNT_LIBRARY[l] for l in lemmas])
        try:
            lemmalist = [(l, (COUNT_LIBRARY[l] / all_lemmas_total))
                         for l in lemmas]
        except ZeroDivisionError:
            print([(COUNT_LIBRARY[l], l) for l in lemmas])
        return lemmalist
    else:
        # Unambiguous token: its single lemma gets probability 1.
        lemmalist = []
        lemmaobj = (lemmas[0], 1)
        lemmalist.append(lemmaobj)
        return lemmalist


tessobj = TessFile(onlyfiles[258])
tokengenerator = iter(tessobj.read_tokens())
tokens = new_file(tokengenerator, 2)
target = tokens.pop(0)
compare_context(target, tokens)

rel_path = os.path.join(
    '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    print('The file %s is not available in cltk_data' % file)

first1000 = latin_pos_lemmatized_sents[0:1000]
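
# Worked example of the weighting above (counts are illustrative): the Latin
# form 'est' can come from the lemma 'sum' ('to be') or 'edo' ('to eat').
# With COUNT_LIBRARY counts of 98 and 2, all_lemmas_total is 100 and the
# candidates split proportionally:
#
#     lemma_probabilities(['sum', 'edo']) == [('sum', 0.98), ('edo', 0.02)]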
def read_files_skipgram(filepath, context_window):
    '''Moves through a .tess file and calls the 'next' and 'skipgram'
    functions as needed. Updates the SKIP_LIBRARY global object.

    Parameters
    ----------
    filepath: a file in .tess format
    context_window: how many words on either side of the target to look at.
    '''
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    tokens = new_file(tokengenerator, context_window)
    stop = 0
    clearflag = 0
    target_position = context_window
    while stop != 1:
        # The target sits 'context_window' places from the start of the list;
        # don't pop it, because it is still needed as context next round.
        targettoken = tokens[target_position]
        # Grab all the other tokens but the target.
        contexttokens = [x for i, x in enumerate(tokens)
                         if i != target_position]
        # Add this context to the skipgram map.
        skipgram(targettoken, contexttokens)
        # Prep the next token in the file.
        try:
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken)
            if len(cleantoken_list) > 1 and \
                    cleantoken_list[-1] in punctuation_list:
                # A sentence has ended; the token list must be cleared
                # *after* this iteration.
                clearflag = 1
            tokens.append(cleantoken_list[0])
            if clearflag == 1:
                # The token list has just received the sentence's final word.
                # Drain the window: keep recording contexts without adding
                # new tokens, so the target slides toward the last word.
                tokens.pop(0)
                while len(tokens) > context_window:
                    targettoken = tokens[target_position]
                    contexttokens = [x for i, x in enumerate(tokens)
                                     if i != target_position]
                    skipgram(targettoken, contexttokens)
                    tokens.pop(0)
                # Initialize the next sentence.
                tokens = new_file(tokengenerator, context_window)
                clearflag = 0
            else:
                tokens.pop(0)
        except StopIteration:
            # EOF reached: drain the remaining window the same way, each pop
            # shifting the target one position closer to the final token.
            # Note: this drain is mostly obsolete now that sentence-final
            # punctuation is handled above.
            try:
                tokens.pop(0)
            except IndexError:
                pass
            while len(tokens) > context_window:
                targettoken = tokens[target_position]
                contexttokens = [x for i, x in enumerate(tokens)
                                 if i != target_position]
                skipgram(targettoken, contexttokens)
                tokens.pop(0)
            stop = 1
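
# read_files_skipgram leans on two helpers defined elsewhere: 'new_file',
# which primes the rolling token window from a fresh generator, and
# 'skipgram', which records co-occurrence counts in SKIP_LIBRARY. Minimal
# sketches under those assumptions; names and details are illustrative,
# not the project's actual implementations.
from collections import Counter, defaultdict

SKIP_LIBRARY = defaultdict(Counter)

def new_file_sketch(tokengenerator, context_window):
    # Pull enough clean tokens to fill one window: the target plus
    # 'context_window' tokens on each side.
    tokens = []
    while len(tokens) < (context_window * 2) + 1:
        try:
            rawtoken = next(tokengenerator)
            tokens.append(token_cleanup(rawtoken)[0])
        except StopIteration:
            break
    return tokens

def skipgram_sketch(targettoken, contexttokens):
    # Record how often each context word appears in the target's window.
    for contextword in contexttokens:
        SKIP_LIBRARY[targettoken][contextword] += 1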