def main():
    if len(sys.argv) != 3 or not os.path.isfile(sys.argv[1]) or not os.path.isfile(sys.argv[2]):
        print "Usage: %s template_file input_file" % sys.argv[0]
        sys.exit(-1)

    # process template file(s) into para list
    t_string = unicode(file(sys.argv[1]).read(), "utf-8")
    t_tokens = tokenise.tokenise(t_string)
    t_para_list = split_paras(t_tokens)

    # process input file into para list
    i_string = unicode(file(sys.argv[2]).read(), "utf-8")
    i_tokens = tokenise.tokenise(i_string)
    i_para_list = split_paras(i_tokens)

    # sanity check -- must be the same number of paras in template and input
    if len(t_para_list) != len(i_para_list):
        print "Number of paragraphs\n template: %s\n input: %s" % (len(t_para_list), len(i_para_list))
        sys.exit(-2)

    # initialise logger
    logger = Logger(t_string, i_string)

    wrapped_paras = []
    for c, (t_para, i_para) in enumerate(zip(t_para_list, i_para_list)):
        logger.set_current_para(c, c)
        if t_para != i_para:
            wrapped_paras.append(wrap_para(t_para, i_para, logger))
        else:
            wrapped_paras.append(i_para)

    outfile = open(sys.argv[2] + ".wrap", "w")
    outfile.write(build_output(wrapped_paras).encode("utf-8"))

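# Example invocation (the script name "wrap.py" is an assumption; both arguments
# must be existing files). The wrapped text is written next to the input file:
#   python wrap.py template.txt input.txt    # writes input.txt.wrap
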
def manual(**kwargs):
    startDate = kwargs['startDate']
    endDate = kwargs['endDate']
    frame = search_console.get_data(startDate=startDate, endDate=endDate)
    frame['report_date'] = pd.to_datetime('today')
    result = tokenise.tokenise(frame=frame, col_name='query')
    data_to_bq.send_data_bq(frame=result, name='gsc_manual', writeType='WRITE_APPEND')

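# Usage sketch: pull Search Console data for a date window, tokenise the "query"
# column and append the result to the gsc_manual BigQuery table. The date values
# below are hypothetical; the expected format is whatever search_console.get_data
# accepts.
manual(startDate='2024-01-01', endDate='2024-01-31')
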
def main():
    """Loads the template and input files and processes them into an output file.
    The output file will have the same name as the input file with ".paramatch" appended."""
    if len(sys.argv) != 3 or not os.path.isfile(sys.argv[1]) or not os.path.isfile(sys.argv[2]):
        print "Usage: %s template_file input_file" % sys.argv[0]
        sys.exit(-1)

    # process template file(s) into para list
    t_string = unicode(file(sys.argv[1]).read(), "utf-8")
    t_tokens = tokenise.tokenise(t_string)
    t_para_list = split_and_sign_paras(t_tokens)

    # process input file into para list
    i_string = unicode(file(sys.argv[2]).read(), "utf-8")
    i_tokens = tokenise.tokenise(i_string)
    i_para_list = split_and_sign_paras(i_tokens)

    # process token lists
    logger = wrap.Logger(t_string, i_string)
    matches = build_match_list(t_para_list, i_para_list)
    matches = process_matches(matches, t_para_list, i_para_list, logger)
    # for m in matches:
    #     print "%04d = %04d : %3d%%" % (m[0][0], m[1][0], int(math.ceil(min(*m[2]) * 100)))

    output = build_output(t_para_list, i_para_list, matches, logger)
    file(sys.argv[2] + ".paramatch", "w").write(output.encode("utf-8"))

def main(filename):
    with open(filename, "r") as f:
        output = ""
        data = f.readline()
        imports = []
        cIL = 0
        # translate the source line by line, threading the import list and the
        # current indentation level through successive parse() calls
        while data:
            parsed = parse(tokenise(data), imports=imports, currentIndentationLevel=cIL)
            output += parsed[0]
            cIL = parsed[1]
            imports = parsed[2]
            data = f.readline()
    # prepend the typing import the generated code relies on, then execute it
    output = "from typing import Union\n" + output
    subprocess.run(["python3", "-c", output])

def main(argv):
    INPUT_FILE = ''
    OUTPUT_FILE = ''
    n = None
    try:
        opts, args = getopt.getopt(argv, "hi:o:n:", ["ifile=", "ofile=", "ngram="])
    except getopt.GetoptError:
        print 'generate_grams.py -i <INPUT_FILE> -o <OUTPUT_FILE> -n <N>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'generate_grams.py -i <inputfile> -o <outputfile> -n <N>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            INPUT_FILE = arg
        elif opt in ("-o", "--ofile"):
            OUTPUT_FILE = arg
        elif opt in ("-n", "--ngram"):
            n = int(arg)

    SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
    LIB_PATH = os.path.join(SCRIPT_PATH, 'lib')
    sys.path.append(LIB_PATH)
    from tokenise import tokenise

    print("Generating %(n)s-grams for %(INPUT_FILE)s" % locals())
    print("Tokenising activity has started.")
    grams_list = tokenise(n, INPUT_FILE)
    print("Tokenising activity has completed.")

    fd = FreqDist(grams_list)
    print("Writing frequency distribution to file.")
    with open(OUTPUT_FILE, 'w') as results:
        writer = csv.writer(results, delimiter="|")
        for gram, count in fd.items():
            writer.writerow([
                unicode(' '.join(gram[0:-1])).encode("utf8"),
                unicode(gram[-1]).encode("utf8"),
                str(count)
            ])
    print("Writing frequency distribution to file completed.")

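# Example invocation, following the script's own usage message (file names are
# placeholders); the output is a pipe-delimited file of context|last word|count:
#   python generate_grams.py -i corpus.txt -o trigrams.csv -n 3
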
def main(filename):
    with open(filename, "r") as f:
        output = ""
        data = f.readline()
        imports = []
        cIL = 0
        # translate the source line by line, threading the import list and the
        # current indentation level through successive parse() calls
        while data:
            parsed = parse(tokenise(data), imports=imports, currentIndentationLevel=cIL)
            output += parsed[0]
            cIL = parsed[1]
            imports = parsed[2]
            data = f.readline()
    output = "from typing import Union\n" + output
    # write the generated Python next to the source, swapping .pnut for .py
    with open(f"{filename.split('.pnut')[0]}.py", "w") as f:
        f.write(output)

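# Usage sketch (the file name is hypothetical): translate greet.pnut into a
# greet.py file in the same directory.
#   main("greet.pnut")
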
def resolve_dependencies(textLineArray, i, outputText, imports=[], currentIndentationLevel=0):
    if textLineArray[i+1] == "module":
        if textLineArray[i+2] not in imports:
            print(f"Resolving dependency {textLineArray[i+2]}...")
            if not os.path.isfile(f"builtins/{textLineArray[i+2]}.py"):
                raise DependencyError(f"{textLineArray[i+2]} is not a builtin.")
            if len(textLineArray) == 3:
                # whole-module import: copy the builtin file verbatim
                with open(f"builtins/{textLineArray[i+2]}.py") as f:
                    data = f.read()
                    outputText += data
                imports.append(textLineArray[i+2])
            else:
                # named-section import: copy only the lines between the
                # "# --- NAME BEGIN" and "# --- NAME END" markers
                with open(f"builtins/{textLineArray[i+2]}.py") as f:
                    imports.append(textLineArray[i+4])
                    name = textLineArray[i+4].upper()
                    data = f.readline()
                    begin = False
                    while data:
                        # begin latches to True once the BEGIN marker is seen
                        begin = bool(int(bool(re.match(f"# --- {name} BEGIN", data))) + int(begin))
                        outputText += bool(begin) * data
                        if not re.match(f"# --- {name} END", data):
                            data = f.readline()
                        else:
                            break
    elif textLineArray[i+1] == "local":
        if textLineArray[i+2] not in imports:
            print(f"Resolving dependency {textLineArray[i+2]}...")
            found = False
            path = os.path.dirname(os.path.realpath(__file__))
            for filename in glob.iglob(f'{path}/*', recursive=True):
                found = filename == f"{path}/{textLineArray[i+2]}.pnut"
                if found:
                    with open(f"{filename}", "r") as f:
                        if len(textLineArray) == 3:
                            # whole-file import: translate the local .pnut file inline
                            imports.append(textLineArray[i+2])
                            line = f.readline()
                            while line:
                                tokenised = tokenise(line)
                                cIL = currentIndentationLevel
                                parsed = parse(tokenised, currentIndentationLevel=cIL)
                                outputText += parsed[0]
                                currentIndentationLevel = parsed[1]
                                line = f.readline()
                            break
                        else:
                            # named-section import: translate only the lines between the
                            # BEGIN and END markers, resolving nested "use" lines first
                            imports.append(textLineArray[i+4])
                            name = textLineArray[i+4].upper()
                            data = f.readline()
                            begin = False
                            while data:
                                if len(data.replace('\n', '')):
                                    if data.split()[0] == "use":
                                        cIL = currentIndentationLevel
                                        outputText, _, _ = resolve_dependencies(
                                            tokenise(data), 0, outputText,
                                            currentIndentationLevel=cIL)
                                    # begin latches to True once the BEGIN marker is seen
                                    begin = bool(int(bool(re.search(f"# --- {name} BEGIN", data))) + int(begin))
                                    tokenised = tokenise(bool(begin) * data)
                                    cIL = currentIndentationLevel
                                    parsed = parse(tokenised, currentIndentationLevel=cIL)
                                    outputText += parsed[0]
                                    currentIndentationLevel = parsed[1]
                                if not re.search(f"# --- {name} END", data):
                                    data = f.readline()
                                else:
                                    break
                            break
            if not found:
                raise DependencyError(f"""
                Could not find {textLineArray[i+2]}.pnut in the local path
                """)
    return outputText, currentIndentationLevel, imports

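# For reference, the named-section form above expects BEGIN/END sentinel comments
# inside the file being imported, e.g. (section and function names hypothetical):
#
#   # --- HELLO BEGIN
#   def hello():
#       print("hello")
#   # --- HELLO END
#
# A three-token "use module <name>" line copies the whole file; the longer form
# (with the section name as the fifth token) copies only the lines between the
# matching markers.
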
café, The following. narrative falls naturally into three divisions, corresponding to distinct and clearly marked periods of Sophy's life. Of the first and second-her childhood at Morpingham and her so- journ in Pavis--the records are fragmentary, and"""
    goodwords = ""
    serial = "1234"
else:
    cgitb.enable()
    form = cgi.FieldStorage()
    if not form.has_key('text'):
        text = u""
    else:
        text = unicode(form['text'].value, "utf-8")
    serial = form.getfirst("serial", "0000")
    projid = form.getfirst("projid", "")
    page_id = form.getfirst("page_id", "")
    check_text = unicode(file("../data/%s/alt-ed/%s" % (projid, page_id)).read(), "utf-8")

print "Content-type: text/html; charset=UTF-8\n"

translate_table = dict(zip([ord(X) for X in u"“”‘’"], [ord(X) for X in u"\"\"''"]))
tokens = tokenise.tokenise(text.translate(translate_table))[:-1]
check_tokens = tokenise.tokenise(check_text.translate(translate_table))[:-1]
calculate_classes(tokens, check_tokens)
sys.stdout.write(build_text(tokens).encode("utf-8"))

from tokenise import tokenise
from token_process import process_tokens
import numpy as np

corpus = raw_input("Enter the name of corpus file: ")
V = int(raw_input("Enter size of vocabulary: "))

# p = process_tokens(corpus)
t = tokenise(corpus)
t.token_create()

np.save("unigram.dict", t.unigram)
np.save("bigram.dict", t.bigram)
np.save("n1w1.dict", t.n1w1)
np.save("n1w2.dict", t.n1w2)
np.save("trigram.dict", t.trigram)
np.save("n1w1w2.dict", t.n1w1w2)
np.save("n1w2w3.dict", t.n1w2w3)

info = {
    'corpus': corpus,
    'V': V,
    "chEp": t.chEp,
    "n1Ep": t.n1Ep,
    "n1w2s": t.n1w2s
}
np.save("info.dict", info)

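# Loading the saved count tables back (a sketch): np.save appends ".npy" to file
# names that do not already end with it, so "unigram.dict" is stored as
# "unigram.dict.npy", and pickled Python dicts need allow_pickle=True plus .item()
# to recover the dict object.
unigram = np.load("unigram.dict.npy", allow_pickle=True).item()
info = np.load("info.dict.npy", allow_pickle=True).item()
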
def handle_data(self, data):
    # only collect text whose innermost open tag is <body>
    if not self.tags or self.tags[-1] != "body":
        return
    self.current_body = self.current_body.union(set(tokenise(data.rstrip('\n'))))

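# A minimal sketch of a parser class such a handler could belong to. The tags
# stack, the current_body set and the tokenise function are taken from the
# snippet above; the class name and the start/end-tag bookkeeping are assumptions.
from html.parser import HTMLParser

class BodyTokeniser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.tags = []             # stack of currently open tags
        self.current_body = set()  # unique tokens seen directly inside <body>

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)

    def handle_endtag(self, tag):
        if self.tags and self.tags[-1] == tag:
            self.tags.pop()

    def handle_data(self, data):
        if not self.tags or self.tags[-1] != "body":
            return
        self.current_body = self.current_body.union(set(tokenise(data.rstrip('\n'))))

# Usage: feed raw HTML and read the collected tokens.
# parser = BodyTokeniser()
# parser.feed("<html><body>some text here</body></html>")
# print(parser.current_body)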