def main():
    if len(sys.argv) != 3 or not os.path.isfile(sys.argv[1]) or not os.path.isfile(sys.argv[2]):
        print "Usage: %s template_file input_file" % sys.argv[0]
        sys.exit(-1)

    # process template file(s) into para list
    t_string = unicode(file(sys.argv[1]).read(), "utf-8")
    t_tokens = tokenise.tokenise(t_string)
    t_para_list = split_paras(t_tokens)

    # process input file into para list
    i_string = unicode(file(sys.argv[2]).read(), "utf-8")
    i_tokens = tokenise.tokenise(i_string)
    i_para_list = split_paras(i_tokens)

    # sanity check -- must be the same number of paras in template and input
    if len(t_para_list) != len(i_para_list):
        print "Number of paragraphs\n template: %s\n input: %s" % (len(t_para_list), len(i_para_list))
        sys.exit(-2)

    # initialise logger
    logger = Logger(t_string, i_string)

    wrapped_paras = []
    for c, (t_para, i_para) in enumerate(zip(t_para_list, i_para_list)):
        logger.set_current_para(c, c)
        if t_para != i_para:
            wrapped_paras.append(wrap_para(t_para, i_para, logger))
        else:
            wrapped_paras.append(i_para)

    outfile = open(sys.argv[2] + ".wrap", "w")
    outfile.write(build_output(wrapped_paras).encode("utf-8"))

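# Example invocation (the script name "wrap.py" is an assumption; both arguments
# must be existing files). The wrapped text is written next to the input file:
#   python wrap.py template.txt input.txt    # writes input.txt.wrap
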
def manual(**kwargs):
    startDate = kwargs['startDate']
    endDate = kwargs['endDate']
    frame = search_console.get_data(startDate=startDate, endDate=endDate)
    frame['report_date'] = pd.to_datetime('today')
    result = tokenise.tokenise(frame=frame, col_name='query')
    data_to_bq.send_data_bq(frame=result, name='gsc_manual', writeType='WRITE_APPEND')

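# Usage sketch: pull Search Console data for a date window, tokenise the "query"
# column and append the result to the gsc_manual BigQuery table. The date values
# below are hypothetical; the expected format is whatever search_console.get_data
# accepts.
manual(startDate='2024-01-01', endDate='2024-01-31')
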
def main():
    """Loads the template and input files and processes them into an output file.
    The output file will have the same name as the input file with ".paramatch" appended."""
    if len(sys.argv) != 3 or not os.path.isfile(sys.argv[1]) or not os.path.isfile(sys.argv[2]):
        print "Usage: %s template_file input_file" % sys.argv[0]
        sys.exit(-1)

    # process template file(s) into para list
    t_string = unicode(file(sys.argv[1]).read(), "utf-8")
    t_tokens = tokenise.tokenise(t_string)
    t_para_list = split_and_sign_paras(t_tokens)

    # process input file into para list
    i_string = unicode(file(sys.argv[2]).read(), "utf-8")
    i_tokens = tokenise.tokenise(i_string)
    i_para_list = split_and_sign_paras(i_tokens)

    # process token lists
    logger = wrap.Logger(t_string, i_string)
    matches = build_match_list(t_para_list, i_para_list)
    matches = process_matches(matches, t_para_list, i_para_list, logger)
    # for m in matches:
    #     print "%04d = %04d : %3d%%" % (m[0][0], m[1][0], int(math.ceil(min(*m[2]) * 100)))

    output = build_output(t_para_list, i_para_list, matches, logger)
    file(sys.argv[2] + ".paramatch", "w").write(output.encode("utf-8"))

def main(filename):
    with open(filename, "r") as f:
        output = ""
        data = f.readline()
        imports = []
        cIL = 0
        # translate the source line by line, threading the import list and the
        # current indentation level through successive parse() calls
        while data:
            parsed = parse(tokenise(data), imports=imports, currentIndentationLevel=cIL)
            output += parsed[0]
            cIL = parsed[1]
            imports = parsed[2]
            data = f.readline()
    # prepend the typing import the generated code relies on, then execute it
    output = "from typing import Union\n" + output
    subprocess.run(["python3", "-c", output])

def main(argv):
    INPUT_FILE = ''
    OUTPUT_FILE = ''
    n = None
    try:
        opts, args = getopt.getopt(argv, "hi:o:n:", ["ifile=", "ofile=", "ngram="])
    except getopt.GetoptError:
        print 'generate_grams.py -i <INPUT_FILE> -o <OUTPUT_FILE> -n <N>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'generate_grams.py -i <inputfile> -o <outputfile> -n <N>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            INPUT_FILE = arg
        elif opt in ("-o", "--ofile"):
            OUTPUT_FILE = arg
        elif opt in ("-n", "--ngram"):
            n = int(arg)

    SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
    LIB_PATH = os.path.join(SCRIPT_PATH, 'lib')
    sys.path.append(LIB_PATH)
    from tokenise import tokenise

    print("Generating %(n)s-grams for %(INPUT_FILE)s" % locals())
    print("Tokenising activity has started.")
    grams_list = tokenise(n, INPUT_FILE)
    print("Tokenising activity has completed.")

    fd = FreqDist(grams_list)
    print("Writing frequency distribution to file.")
    with open(OUTPUT_FILE, 'w') as results:
        writer = csv.writer(results, delimiter="|")
        for gram, count in fd.items():
            writer.writerow([
                unicode(' '.join(gram[0:-1])).encode("utf8"),
                unicode(gram[-1]).encode("utf8"),
                str(count)
            ])
    print("Writing frequency distribution to file completed.")

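# Example invocation, following the script's own usage message (file names are
# placeholders); the output is a pipe-delimited file of context|last word|count:
#   python generate_grams.py -i corpus.txt -o trigrams.csv -n 3
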
def main(filename):
    with open(filename, "r") as f:
        output = ""
        data = f.readline()
        imports = []
        cIL = 0
        # translate the source line by line, threading the import list and the
        # current indentation level through successive parse() calls
        while data:
            parsed = parse(tokenise(data), imports=imports, currentIndentationLevel=cIL)
            output += parsed[0]
            cIL = parsed[1]
            imports = parsed[2]
            data = f.readline()
    output = "from typing import Union\n" + output
    # write the generated Python next to the source, swapping .pnut for .py
    with open(f"{filename.split('.pnut')[0]}.py", "w") as f:
        f.write(output)

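# Usage sketch (the file name is hypothetical): translate greet.pnut into a
# greet.py file in the same directory.
#   main("greet.pnut")
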
def resolve_dependencies(textLineArray, i, outputText, imports=[], currentIndentationLevel=0):
    if textLineArray[i+1] == "module":
        if textLineArray[i+2] not in imports:
            print(f"Resolving dependency {textLineArray[i+2]}...")
            if not os.path.isfile(f"builtins/{textLineArray[i+2]}.py"):
                raise DependencyError(f"{textLineArray[i+2]} is not a builtin.")
            if len(textLineArray) == 3:
                # whole-module import: copy the builtin file verbatim
                with open(f"builtins/{textLineArray[i+2]}.py") as f:
                    data = f.read()
                    outputText += data
                imports.append(textLineArray[i+2])
            else:
                # named-section import: copy only the lines between the
                # "# --- NAME BEGIN" and "# --- NAME END" markers
                with open(f"builtins/{textLineArray[i+2]}.py") as f:
                    imports.append(textLineArray[i+4])
                    name = textLineArray[i+4].upper()
                    data = f.readline()
                    begin = False
                    while data:
                        # begin latches to True once the BEGIN marker is seen
                        begin = bool(int(bool(re.match(f"# --- {name} BEGIN", data))) + int(begin))
                        outputText += bool(begin) * data
                        if not re.match(f"# --- {name} END", data):
                            data = f.readline()
                        else:
                            break
    elif textLineArray[i+1] == "local":
        if textLineArray[i+2] not in imports:
            print(f"Resolving dependency {textLineArray[i+2]}...")
            found = False
            path = os.path.dirname(os.path.realpath(__file__))
            for filename in glob.iglob(f'{path}/*', recursive=True):
                found = filename == f"{path}/{textLineArray[i+2]}.pnut"
                if found:
                    with open(f"{filename}", "r") as f:
                        if len(textLineArray) == 3:
                            # whole-file import: translate the local .pnut file inline
                            imports.append(textLineArray[i+2])
                            line = f.readline()
                            while line:
                                tokenised = tokenise(line)
                                cIL = currentIndentationLevel
                                parsed = parse(tokenised, currentIndentationLevel=cIL)
                                outputText += parsed[0]
                                currentIndentationLevel = parsed[1]
                                line = f.readline()
                            break
                        else:
                            # named-section import: translate only the lines between the
                            # BEGIN and END markers, resolving nested "use" lines first
                            imports.append(textLineArray[i+4])
                            name = textLineArray[i+4].upper()
                            data = f.readline()
                            begin = False
                            while data:
                                if len(data.replace('\n', '')):
                                    if data.split()[0] == "use":
                                        cIL = currentIndentationLevel
                                        outputText, _, _ = resolve_dependencies(
                                            tokenise(data), 0, outputText,
                                            currentIndentationLevel=cIL)
                                    # begin latches to True once the BEGIN marker is seen
                                    begin = bool(int(bool(re.search(f"# --- {name} BEGIN", data))) + int(begin))
                                    tokenised = tokenise(bool(begin) * data)
                                    cIL = currentIndentationLevel
                                    parsed = parse(tokenised, currentIndentationLevel=cIL)
                                    outputText += parsed[0]
                                    currentIndentationLevel = parsed[1]
                                if not re.search(f"# --- {name} END", data):
                                    data = f.readline()
                                else:
                                    break
                            break
            if not found:
                raise DependencyError(f"""
                Could not find {textLineArray[i+2]}.pnut in the local path
                """)
    return outputText, currentIndentationLevel, imports

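# For reference, the named-section form above expects BEGIN/END sentinel comments
# inside the file being imported, e.g. (section and function names hypothetical):
#
#   # --- HELLO BEGIN
#   def hello():
#       print("hello")
#   # --- HELLO END
#
# A three-token "use module <name>" line copies the whole file; the longer form
# (with the section name as the fifth token) copies only the lines between the
# matching markers.
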
café, The following. narrative falls naturally into three divisions, corresponding to distinct and clearly marked periods of Sophy's life. Of the first and second-her childhood at Morpingham and her so- journ in Pavis--the records are fragmentary, and"""
    goodwords = ""
    serial = "1234"
else:
    cgitb.enable()
    form = cgi.FieldStorage()
    if not form.has_key('text'):
        text = u""
    else:
        text = unicode(form['text'].value, "utf-8")
    serial = form.getfirst("serial", "0000")
    projid = form.getfirst("projid", "")
    page_id = form.getfirst("page_id", "")
    check_text = unicode(file("../data/%s/alt-ed/%s" % (projid, page_id)).read(), "utf-8")

print "Content-type: text/html; charset=UTF-8\n"

translate_table = dict(zip([ord(X) for X in u"“”‘’"], [ord(X) for X in u"\"\"''"]))
tokens = tokenise.tokenise(text.translate(translate_table))[:-1]
check_tokens = tokenise.tokenise(check_text.translate(translate_table))[:-1]
calculate_classes(tokens, check_tokens)
sys.stdout.write(build_text(tokens).encode("utf-8"))

from tokenise import tokenise
from token_process import process_tokens
import numpy as np

corpus = raw_input("Enter the name of corpus file: ")
V = int(raw_input("Enter size of vocabulary: "))

# p = process_tokens(corpus)
t = tokenise(corpus)
t.token_create()

np.save("unigram.dict", t.unigram)
np.save("bigram.dict", t.bigram)
np.save("n1w1.dict", t.n1w1)
np.save("n1w2.dict", t.n1w2)
np.save("trigram.dict", t.trigram)
np.save("n1w1w2.dict", t.n1w1w2)
np.save("n1w2w3.dict", t.n1w2w3)

info = {
    'corpus': corpus,
    'V': V,
    "chEp": t.chEp,
    "n1Ep": t.n1Ep,
    "n1w2s": t.n1w2s
}
np.save("info.dict", info)

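# Loading the saved count tables back (a sketch): np.save appends ".npy" to file
# names that do not already end with it, so "unigram.dict" is stored as
# "unigram.dict.npy", and pickled Python dicts need allow_pickle=True plus .item()
# to recover the dict object.
unigram = np.load("unigram.dict.npy", allow_pickle=True).item()
info = np.load("info.dict.npy", allow_pickle=True).item()
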
def handle_data(self, data):
    # only collect text whose innermost open tag is <body>
    if not self.tags or self.tags[-1] != "body":
        return
    self.current_body = self.current_body.union(set(tokenise(data.rstrip('\n'))))

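# A minimal sketch of a parser class such a handler could belong to. The tags
# stack, the current_body set and the tokenise function are taken from the
# snippet above; the class name and the start/end-tag bookkeeping are assumptions.
from html.parser import HTMLParser

class BodyTokeniser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.tags = []             # stack of currently open tags
        self.current_body = set()  # unique tokens seen directly inside <body>

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)

    def handle_endtag(self, tag):
        if self.tags and self.tags[-1] == tag:
            self.tags.pop()

    def handle_data(self, data):
        if not self.tags or self.tags[-1] != "body":
            return
        self.current_body = self.current_body.union(set(tokenise(data.rstrip('\n'))))

# Usage: feed raw HTML and read the collected tokens.
# parser = BodyTokeniser()
# parser.feed("<html><body>some text here</body></html>")
# print(parser.current_body)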