Example 1
def get_tokenized_sentences(text, language):
    lang = ""

    if language == "english":
        lang = "en"
    elif language == "hindi":
        lang = "hi"
    elif language == "telugu":
        lang = "te"

    tk = Tokenizer(lang=lang, split_sen=True)

    tokens = tk.tokenize(text)

    return get_sentences(tokens)
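For context, a minimal sketch (not part of the original example) of the underlying polyglot_tokenizer call: with split_sen=True the tokenizer returns one token list per sentence, which the get_sentences helper above is presumably left to post-process. The Hindi sample text is illustrative only.

from polyglot_tokenizer import Tokenizer

# Hypothetical usage: split_sen=True yields a list of per-sentence token lists.
tk = Tokenizer(lang="hi", split_sen=True)
for sentence_tokens in tk.tokenize(u"पहला वाक्य। दूसरा वाक्य।"):
    print(sentence_tokens)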
Example 2
class Root(Tk):
    def __init__(self):
        super(Root, self).__init__()
        self.title("Tkinter Browse")
        self.minsize(1000, 600)
        self.labelFrame = ttk.Label(self, text="")
        self.labelFrame.grid(row=0, column=1)
        self.label_frame = ttk.Frame(self, height=50)
        self.label_frame.grid(
            row=10)  # Stops child widgets of label_frame from resizing it

        self.entry = ttk.Entry(self.labelFrame, text="", width=50)
        self.entry.grid()
        self.button()

    def button(self):

        self.button = ttk.Button(self.labelFrame,
                                 text="Browse",
                                 command=self.filedialog)
        self.button.grid(column=3, row=0)

        #self.entry1=ttk.Entry(self.label_frame,width=50)

    # self.entry1.grid(row=10)
    ## self.label1=ttk.Label(self.labelFrame,text="",width=100)
    #self.label1.grid(row=10)

    def filedialog(self):
        #root.filename = tkFileDialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("jpeg files","*.jpg"),("all files","*.*")))

        self.filename = filedialog.askopenfilename(initialdir="/",
                                                   title="Select file")
        self.entry.insert(0, self.filename)
        self.text = docxpy.process(self.filename)
        print(self.text)
        self.txt = ScrolledText(self.labelFrame, width=100, height=20)
        self.txt.grid(row=7)
        self.txt.insert(const.END, self.text)
        #self.label1.config(text=self.text)
        # self.entry1.insert(0,self.text)
        self.token()

    def entry(self):

        self.entry = ttk.Entry(self.labelFrame, text="", width=50)
        self.entry.grid(column=1, row=6, padx=5, pady=60, ipady=3)
        return

    def token(self):
        self.tkn = Tokenizer(lang='ml',
                             smt=True)  # smt is a flag for social-media text
        #self.text = docxpy.process(self.filename)
        print(self.tkn.tokenize(self.text))
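A minimal launch sketch for the class above, assuming the imports the snippet relies on resolve roughly as follows (const, in particular, is assumed to be tkinter.constants):

from tkinter import Tk, ttk, filedialog, constants as const
from tkinter.scrolledtext import ScrolledText
import docxpy
from polyglot_tokenizer import Tokenizer

# ... class Root(Tk) as defined above ...

root = Root()
root.mainloop()  # start the Tk event loop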
Example 3
def tokenize_data(data_path, lang, forcesave=False):
    """
    Load and save tokenized data
    """

    tokenized_data_path = data_path + ".ptok"
    if not forcesave:
        if path.exists(tokenized_data_path):
            logger.info("Tokenized file already present at {}".format(
                tokenized_data_path))
            n_sents = 0
            with codecs.open(tokenized_data_path, "rb") as fp:
                n_sents = len(pickle.load(fp))
            return tokenized_data_path, n_sents

    data_tuple = []
    n_sents = 0
    with codecs.open(data_path, 'r', encoding='utf-8') as fp:
        logger.info("Loading whole data in memory ...")
        textlines = fp.readlines()
        tok = Tokenizer(lang=lang, split_sen=True)
        tokenized_sents = tok.tokenize_lines(textlines)
        for tokens in tokenized_sents:
            sent = []
            for token in tokens:
                # Necessary to use as tuple for caching
                # while generating features based on
                # previous, current and next word
                sent.append((token, "", ""))
            data_tuple.append(sent)
            n_sents += 1
        logger.info("Tokenization done")

    with codecs.open(tokenized_data_path, "wb") as wt:
        logger.info("Writing data into pickle format")
        pickle.dump(data_tuple, wt, protocol=pickle.HIGHEST_PROTOCOL)
        logger.info("Data written")

    return tokenized_data_path, n_sents
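A hedged usage sketch showing how the cached .ptok pickle produced above can be read back; the corpus path and language are placeholders.

import codecs
import pickle

# Hypothetical call: tokenize a Hindi text file (placeholder path).
tok_path, n_sents = tokenize_data("corpus.hi.txt", lang="hi")
with codecs.open(tok_path, "rb") as fp:
    sentences = pickle.load(fp)
# Each sentence is a list of (token, "", "") tuples, ready for feature generation.
print(n_sents, sentences[0][:5])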
Example 4
def load_data(text_type, filename, lang, tokenize_text=False, split_sent=True):
    data_tuple = []
    with codecs.open(filename, 'r', encoding='utf-8') as fp:
        logger.info('Loading text_type: %s format' % (text_type))
        if text_type == "ssf":
            start_c = -1
            for line in fp:
                line = line.strip()
                ds = line.split()
                #print("Line", line)
                #print("DS", ds)
                if line == "":
                    continue
                elif line[0:2] == "<S":
                    sent = []
                elif line[0:3] == "</S":
                    data_tuple.append(sent)
                elif line[0] == "<":
                    continue
                elif ds[0] == "0" or ds[0] == "))":
                    continue
                elif ds[1] == "((":
                    start_c, chunk_tag = 1, ds[2]
                    #print "hello-chunk tag",chunk_tag
                if len(ds) > 2:
                    if ds[2]:
                        #print "--",line,"--"
                        word, tag = ds[1], ds[2]
                        if start_c == -1:
                            sent.append((word, tag, ""))
                        if start_c == 1:
                            sent.append((word, tag, "B-%s" % (chunk_tag)))
                            start_c = 0
                        if start_c == 0:
                            sent.append((word, tag, "I-%s" % (chunk_tag)))
        elif text_type == "conll":
            sent = []
            for line in fp:
                line = line.strip()
                ds = line.split()
                if line != "":
                    print(line)
                    if len(ds) == 2:
                        word, tag, chunk = ds[1], "", ""
                    if len(ds) == 3:
                        word, tag, chunk = ds[1], ds[2], ""
                    if len(ds) == 4:
                        word, tag, chunk = ds[1], ds[2], ds[3]
                    sent.append([word, tag, chunk])
                else:
                    data_tuple.append(sent)
                    sent = []
        elif text_type == "txt":
            if split_sent == True:
                text = fp.read()
                tok = Tokenizer(lang=lang, split_sen=split_sent)
                tokenized_sents = tok.tokenize(text)
                sent = []
                for tokens in tokenized_sents:
                    for token in tokens:
                        sent.append([token, "", ""])
                    data_tuple.append(sent)
            else:
                for line in fp:
                    sent = []
                    if tokenize_text:
                        tok = Tokenizer(lang=lang, split_sen=False)
                        tokenized_sents = tok.tokenize(line)
                    for tokens in tokenized_sents:
                        for token in tokens:
                            sent.append([token, "", ""])
                    data_tuple.append(sent)
        else:
            print("Check - text_type", text_type)

    return data_tuple
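A hedged sketch of driving the loader with a tiny CoNLL-style file; the file name, words and tags are made up for illustration. Each non-empty line carries an index, a word and optionally a tag and a chunk label, and blank lines separate sentences.

import codecs

sample = u"1 राम NNP B-NP\n2 खाता VM B-VGF\n\n"
with codecs.open("sample.conll", "w", encoding="utf-8") as fp:  # hypothetical path
    fp.write(sample)

sents = load_data("conll", "sample.conll", lang="hi")
print(sents)  # [[['राम', 'NNP', 'B-NP'], ['खाता', 'VM', 'B-VGF']]]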
Example 5
def pipeline():
    curr_dir = path.dirname(path.abspath(__file__))
    args = get_args()

    output_dir = path.join(path.dirname(path.abspath(__file__)), "outputs")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    data_writer.set_logger(args.model_type, output_dir)

    model_path = "%s/models/%s/%s.%s.%s.model" % (
        curr_dir, args.language, args.model_type, args.tag_type, args.encoding)
    if args.model_type == "lstm":
        if args.tag_type == "pos":
            model_path = "%s/models/%s/lstm/" % (curr_dir, args.language)
        elif args.tag_type == "chunk":
            model_path = "%s/models/%s/lstm/chunk/" % (curr_dir, args.language)
        elif args.tag_type == "ner":
            model_path = "%s/models/%s/lstm/ner/" % (curr_dir, args.language)
    if args.tag_type != "parse":
        if not os.path.exists(model_path):
            os.makedirs(model_path)

    if args.pipeline_type == 'train':
        logger.info('Start Training')
        logger.info('Tagger model type: %s' % (args.model_type))
        data_path = "%s/data/train/%s/train.%s.%s" % (
            curr_dir, args.language, args.encoding, args.data_format)
        if args.tag_type == "ner":
            data_path = data_path + ".ner"

        data_sents = data_reader.load_data(args.data_format, data_path,
                                           args.language)

        no_words = sum(len(sent) for sent in data_sents)
        logger.info("No. of words: %d" % (no_words))
        logger.info("No. of sents: %d" % (len(data_sents)))

        X_data = [
            generate_features.sent2features(s, args.tag_type, args.model_type)
            for s in data_sents
        ]
        y_data = [
            generate_features.sent2labels(s, args.tag_type) for s in data_sents
        ]

        X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                            y_data,
                                                            test_size=0.10,
                                                            random_state=42)

        print('Train data size:', len(X_train), len(y_train))
        print('Test data size:', len(X_test), len(y_test))
        print('Lang:', args.language)
        print('Train data: ', data_path)
        print('Model Path: ', model_path)
        if args.model_type == "crf":
            tagger = CRF(model_path)
            tagger.train(X_train, y_train)
            tagger.load_model()
            tagger.test(X_test, y_test)
        elif args.model_type == "lstm":
            x_data, y_data1, y_data2 = load_data_and_labels(data_path)
            if args.tag_type == "pos":
                x_train, x_test, y_train1, y_test1 = train_test_split(
                    x_data, y_data1, test_size=0.10,
                    random_state=42)  #Split the data into train and test
                model = Sequence()  #Intialize BiLSTM model
                model.fit(x_train, y_train1,
                          epochs=10)  #Train the model for 10 echos
                print(model.score(x_test,
                                  y_test1))  #Run the model on test data
                model.save(model_path + "/weights.h5",
                           model_path + "/params.json",
                           model_path + "/preprocessor.json")
            if args.tag_type == "chunk":
                x_train, x_test, y_train2, y_test2 = train_test_split(
                    x_data, y_data2, test_size=0.10,
                    random_state=42)  #Split the data into train and test
                model = Sequence()  #Intialize BiLSTM model
                model.fit(x_train, y_train2,
                          epochs=10)  #Train the model for 10 echos
                print(model.score(x_test,
                                  y_test2))  #Run the model on test data
                model.save(model_path + "/weights.h5",
                           model_path + "/params.json",
                           model_path + "/preprocessor.json")
            if args.tag_type == "ner":
                x_train, x_test, y_train1, y_test1 = train_test_split(
                    x_data, y_data1, test_size=0.10,
                    random_state=42)  #Split the data into train and test
                model = Sequence()  #Intialize BiLSTM model
                model.fit(x_train, y_train1,
                          epochs=10)  #Train the model for 10 echos
                print(model.score(x_test,
                                  y_test1))  #Run the model on test data
                model.save(model_path + "/weights.h5",
                           model_path + "/params.json",
                           model_path + "/preprocessor.json")

    if args.pipeline_type == "test":
        if args.model_type == "crf":
            test_data_path = "%s/%s" % (curr_dir, args.test_data)

            test_sents = data_reader.load_data(args.data_format,
                                               test_data_path,
                                               args.language,
                                               tokenize_text=False)
            X_test = [
                generate_features.sent2features(s, args.tag_type,
                                                args.model_type)
                for s in test_sents
            ]
            y_test = [
                generate_features.sent2labels(s, args.tag_type)
                for s in test_sents
            ]
            tagger = CRF(model_path)
            tagger.load_model()
            tagger.test(X_test, y_test)

    if args.pipeline_type == "predict":

        test_data_path = "%s" % (args.test_data)
        test_sents = data_reader.load_data(args.data_format,
                                           test_data_path,
                                           args.language,
                                           tokenize_text=True,
                                           split_sent=args.sent_split)
        if args.tag_type == "parse":
            # POS tagging
            X_test = [
                generate_features.sent2features(s, "pos", args.model_type)
                for s in test_sents
            ]

            tag_model_path = "%s/models/%s/%s.%s.%s.model" % (
                curr_dir, args.language, args.model_type, "pos", args.encoding)
            chunk_model_path = "%s/models/%s/%s.%s.%s.model" % (
                curr_dir, args.language, args.model_type, "chunk",
                args.encoding)

            if args.model_type == "crf":
                tagger = CRF(tag_model_path)
                tagger.load_model()
                y_pos = tagger.predict(X_test)

                test_sents_pos = generate_features.append_tags(
                    test_sents, "pos", y_pos)
                X_test = [
                    generate_features.sent2features(s, "chunk",
                                                    args.model_type)
                    for s in test_sents_pos
                ]

                chunker = CRF(chunk_model_path)
                chunker.load_model()
                y_chunk = chunker.predict(X_test)

                test_fname = path.basename(test_data_path)
                output_file = "%s/%s.parse" % (output_dir, test_fname)
                data_writer.write_anno_to_file(output_file, test_sents_pos,
                                               y_chunk, "chunk")
                logger.info("Output in: %s" % output_file)
                data_writer.write_to_screen(output_file)
        else:
            X_test = [
                generate_features.sent2features(s, args.tag_type,
                                                args.model_type)
                for s in test_sents
            ]

            if args.model_type == "crf":
                tagger = CRF(model_path)
                tagger.load_model()
                y_pred = tagger.predict(X_test)
                data_writer.write_anno_to_file(args.output_path, test_sents,
                                               y_pred, args.tag_type)
                data_writer.write_to_screen(args.output_path)
                logger.info("Output in: %s" % args.output_path)

            if args.model_type == "lstm":
                model = Sequence().load(model_path + "/weights.h5",
                                        model_path + "/params.json",
                                        model_path + "/preprocessor.json")
                f = open(args.test_data, "r")
                sent = f.read()
                tok = Tokenizer(lang=args.language, split_sen=True)
                tokenized_sents = tok.tokenize(sent)
                for tokens in tokenized_sents:
                    for token in tokens:
                        sent = sent + " " + token
                    sent = sent.strip()
                    print(model.analyze(sent))
Example 6
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
from indictrans import Transliterator
from polyglot_tokenizer import Tokenizer

flag = True
s = 'hin'
t = 'eng'

forward_transl_full = Transliterator(source=s, target=t, build_lookup=True)

forward_transl_token = Transliterator(source=s, target=t, decode='beamsearch')
back_transl_token = Transliterator(source=t, target=s, build_lookup=True)

tk = Tokenizer(lang=s[:2])
tk_back = Tokenizer(lang=t[:2])

l = u"रज्ज के रुलाया"  #\nरज्ज के हंसाया\n\nमैंने दिल खो' के इश्क़ कमाया\n"

l = l.lower().strip()

lines = l.split("\n")
print(lines)

output = []
if flag:
    for l in lines:
        json = {}

        definitive = forward_transl_full.transform(l)
Example 7
from __future__ import unicode_literals
from polyglot_tokenizer import Tokenizer
tk = Tokenizer(lang='ml', smt=True)  # smt is a flag for social-media text
text = "രണ്ട് വർഷംമുമ്പ് നടന്ന നിയമസഭാ തെരഞ്ഞെടുപ്പിൽ തിരിച്ചടി ലഭിച്ചതിനുശേഷം ഗുജറാത്തിൽ ബിജെപിയുടേത് ഒരുതരം ഞാണിൻമേൽക്കളിയാണ്. കഴിഞ്ഞ ലോക്സഭാ തെരഞ്ഞെടുപ്പിൽ ആകെയുള്ള 26 സീറ്റിലും വിജയിച്ച ബിജെപി ഇക്കുറി അത് നിലനിർത്താനായി  എല്ലാ വൃത്തികെട്ട കളിയും പുറത്തെടുക്കുകയാണ്."
print(tk.tokenize(text))
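As a small variation (an assumption, not part of the original snippet), the tokenizer can also split the text into sentences first by passing split_sen=True, returning one token list per sentence:

tk_sent = Tokenizer(lang='ml', split_sen=True)
for sentence_tokens in tk_sent.tokenize(text):
    print(sentence_tokens)  # tokens of one sentence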
Example 8
def process_args(args):

    if not (args.ml or args.rb):
        args.rb = True
    if args.infile:
        ifp = io.open(args.infile, encoding='utf-8')
    else:
        if sys.version_info[0] >= 3:
            ifp = codecs.getreader('utf8')(sys.stdin.buffer)
        else:
            ifp = codecs.getreader('utf8')(sys.stdin)

    if args.outfile:
        ofp = io.open(args.outfile, mode='w', encoding='utf-8')
    else:
        if sys.version_info[0] >= 3:
            ofp = codecs.getwriter('utf8')(sys.stdout.buffer)
        else:
            ofp = codecs.getwriter('utf8')(sys.stdout)

    # LIMIT CASES # BUGS INDIC-TRANS
    # HARD_CODED
    if args.target == "urd" or args.source == 'urd':
        args.build_lookup = False

    # SELECT REGEX TO SEARCH WORD OFFSETS INSIDE A DOCUMENT (INDIC -> UTF-8, ENG -> ASCII)
    if args.source == 'eng' and args.target in ISO_3to2 and args.target != 'eng':
        # UTF8 unicode parser regex
        def my_regex(word):
            return r"(?<!\S){}(?!\S)".format(re.escape(word))
    else:
        # ASCII romanized parser regex
        def my_regex(word):
            return r"\b{}\b".format(re.escape(word))

    if args.output_format == 'stdout':

        # initialize transliterator object
        trn = Transliterator(args.source,
                             args.target,
                             rb=args.rb,
                             build_lookup=args.build_lookup)

        # transliterate text
        for line in ifp:

            if args.source == 'hin' and args.target == 'eng':
                replacements = {
                    u"\u0950": "om",
                    u"\u0915\u092e\u0932": "kamal"
                }
                for script, roman in replacements.items():
                    line = line.replace(script, roman)

            tline = trn.convert(line)

            if u"whatsapp" in tline and args.target == 'eng' and u"whatsapp" not in line:
                tline = tline.replace("whatsapp", "vhaatsapp")

            ofp.write(tline)

        # close files
        ifp.close()
        ofp.close()

    elif args.output_format == 'json':

        # getting source language from terminal
        source = args.source

        # getting target language from terminal
        target = args.target

        # Full forward (source lang -> target lang) transliterator at SENTENCE LEVEL
        forward_transl_full = Transliterator(source=source,
                                             target=target,
                                             rb=args.rb,
                                             build_lookup=args.build_lookup)

        # forward (source lang -> target lang) transliterator at TOKEN LEVEL; we use this to transliterate every token independently
        # from source to target lang with multiple choices (beamsearch)
        forward_transl_token = Transliterator(source=source,
                                              target=target,
                                              rb=args.rb,
                                              decode='beamsearch')

        # backward (target lang -> source lang) transliterator at TOKEN LEVEL; we use this to check the back-transliteration of the result
        back_transl_token = Transliterator(source=target,
                                           target=source,
                                           rb=args.rb,
                                           build_lookup=args.build_lookup)

        # Tokenizer of source language
        tk = Tokenizer(lang=ISO_3to2[source])

        # Tokenizer of target language
        tk_back = Tokenizer(lang=ISO_3to2[target])

        # Soundex instance for checking phonetic similarity between words
        instance = Soundex()

        # array of output sentences
        output = []

        # "seen" map to recognize words that have already been processed
        seen = {}

        # read entire source text to transliterate
        document_input = ifp.read()

        # document_input divided by lines
        lines = document_input.splitlines()

        # progressively transliterated text
        document_translitted = u""

        # for every line
        for l in lines:

            # Treat special cases
            if u"\u0950" in l and source == 'hin' and target == 'eng':
                l = l.replace(u"\u0950", "om")

            # prepare a json for every line
            json = {}

            # transform the entire sentence as the first-choice transliteration
            definitive = forward_transl_full.transform(l)

            # add the transliterated line to the full transliterated text
            document_translitted += definitive + u"\n"

            # tokenize the original sentence
            tokens = tk.tokenize(l)

            # tokenize the transformed text with the target-language tokenizer
            back_tokens = tk_back.tokenize(clean_str(definitive))

            # the "text" field is the first-choice sentence without alternatives (as in stdout mode)

            #json["tokenization"] = back_tokens

            json["tokens"] = []

            # index of the token within the sentence, punctuation excluded
            count_tokens = 0

            # pair each source token with its transliterated counterpart
            for index, (t, choosen) in enumerate(zip(tokens, back_tokens)):

                inner_json = {}

                # suggestions for the chosen token
                suggestions = []

                exclusions = []

                # transliterate the source token with beamsearch (5 results)
                forward_out = forward_transl_token.transform(t)

                # iterate over the alternatives
                for c in forward_out:

                    # for every alternative, back-transliterate it
                    back_out = back_transl_token.transform(c)

                    # if the back-transliterated token equals the original token but differs from the chosen transliteration, keep it as a suggestion
                    if back_out == t and c != choosen:
                        suggestions.append(clean_str(c))
                    else:
                        if c != choosen:
                            exclusions.append(clean_str(c))

                # add the chosen token to all possible choices [suggestions + choice]
                all_possible_choices = list(suggestions)
                all_possible_choices.insert(0, choosen)

                # map all suggestions (+ the chosen token) to the phonetic alphabet with Soundex
                transformed = []

                for c in all_possible_choices:
                    p = instance.soundex(c)
                    transformed.append(p)

                duplicates = {}

                for p, original_text in zip(transformed, all_possible_choices):

                    if p not in duplicates:
                        duplicates[p] = []
                        duplicates[p].append(clean_str(original_text))

                    else:
                        duplicates[p].append(clean_str(original_text))

                new_duplicates = {}
                suggestion_duplicates = []

                # for every group of phonetically identical choices, keep the first as the key and the rest as duplicates
                for _, v in duplicates.items():
                    new_duplicates[v[0]] = v[1:]
                    suggestion_duplicates.extend(v[1:])

                #my_regex = u'(\s|^)%s(\s|$)'  % choosen
                #my_regex = r"\b" + re.escape(choosen) + r"\b"

                if source == 'kan' and target == 'eng':

                    new_choosen = resolveKannada(choosen, exclusions)

                    if new_choosen != choosen:

                        exclusions.remove(new_choosen)
                        exclusions.append(choosen)

                        if choosen in new_duplicates:
                            new_duplicates[new_choosen] = new_duplicates.pop(
                                choosen)

                        if new_choosen not in json["text"]:
                            json["text"] = json["text"].replace(
                                choosen, new_choosen)

                            new_last_line = document_translitted.strip().split(
                                u"\n")[-1].replace(choosen, new_choosen)
                            document_translitted = u'\n'.join(
                                document_translitted.split(u"\n")
                                [0:-2]) + "\n" + new_last_line + "\n"

                        choosen = new_choosen

                if (source == 'hin' or source == 'kan') and target == 'eng':
                    new_choosen = resolveHindi(choosen, exclusions)

                    if new_choosen != choosen:

                        exclusions.remove(new_choosen)
                        exclusions.append(choosen)

                        if choosen in new_duplicates:
                            new_duplicates[new_choosen] = new_duplicates.pop(
                                choosen)

                        if new_choosen not in json["text"]:
                            json["text"] = json["text"].replace(
                                choosen, new_choosen)

                            new_last_line = document_translitted.strip().split(
                                u"\n")[-1].replace(choosen, new_choosen)
                            document_translitted = u'\n'.join(
                                document_translitted.split(u"\n")
                                [0:-2]) + "\n" + new_last_line + "\n"

                        choosen = new_choosen

                if target == 'eng':
                    if choosen == "whatsapp" and t != "whatsapp":
                        definitive = definitive.replace(
                            "whatsapp", "vhaatsapp")
                        choosen = "vhaatsapp"
                        new_last_line = document_translitted.strip().split(
                            u"\n")[-1].replace("whatsapp", "vhaatsapp")
                        document_translitted = u'\n'.join(
                            document_translitted.split(u"\n")
                            [0:-2]) + "\n" + new_last_line + "\n"
                        exclusions.append("whatsapp")

                    if choosen == t:
                        exclusions = []

                json["text"] = definitive

                r = re.compile(my_regex(choosen),
                               flags=re.I | re.X | re.UNICODE)

                # calculate the length of the chosen token, ignoring non-spacing marks
                length = len([1 for c in choosen if not c in UNICODE_NSM_ALL])

                for m in r.finditer(document_translitted.strip()):

                    # take every occurrence found inside full text
                    word = m.group()

                    characterOffsetBegin = m.start()
                    characterOffsetEnd = characterOffsetBegin + length - 1

                    found = -1

                    if word in seen:
                        found = seen[word]

                    if characterOffsetBegin > found:
                        count_tokens += 1
                        seen[word] = characterOffsetEnd

                        inner_json["source"] = t
                        inner_json["token"] = choosen
                        inner_json["index"] = count_tokens
                        inner_json["duplicates"] = new_duplicates
                        inner_json["exclusions"] = exclusions
                        inner_json["suggestions"] = [
                            s for s in suggestions
                            if s not in suggestion_duplicates
                        ]
                        inner_json[
                            'characterOffsetBegin'] = characterOffsetBegin
                        inner_json['characterOffsetEnd'] = characterOffsetEnd
                        json["tokens"].append(inner_json)

                        break

            output.append(json)

        final_output = {"sentences": output}

        r = js.dumps(final_output)
        ofp.write(r)

        # close files
        ifp.close()
        ofp.close()

    else:

        # close files
        ifp.close()
        ofp.close()
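A hedged sketch of how process_args might be driven programmatically; the attribute values are placeholders (in the real tool they would come from an argument parser), and the module-level helpers the function relies on are assumed to be importable alongside it.

from argparse import Namespace

# Hypothetical arguments; every attribute name is one that process_args reads.
args = Namespace(ml=False, rb=True,
                 infile="input.hin.txt",   # placeholder input path
                 outfile="output.json",    # placeholder output path
                 source="hin", target="eng",
                 build_lookup=True,
                 output_format="json")
process_args(args)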