def main(args):
    if os.path.exists(args.output_file):
        raise FileExistsError('File exists: {}'.format(args.output_file))
    csv_reader = pd.read_csv(args.input_file,
                             chunksize=args.batch_size,
                             usecols=[
                                 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE',
                                 'CATEGORY', 'DESCRIPTION', 'ISERROR', 'TEXT'
                             ],
                             dtype={
                                  'SUBJECT_ID': 'int32',
                                 'HADM_ID': 'str',
                                 'CATEGORY': 'str',
                                 'DESCRIPTION': 'str',
                                 'ISERROR': 'str',
                                 'TEXT': 'str'
                             },
                             keep_default_na=False,
                             na_values='')

    with jsonlines.open(args.output_file, 'w') as notes_tokenized_file:
        tokenizer = Tokenizer(args.batch_size, args.n_cpus, args.n_threads,
                              MODE)

        for i, notes_batch in enumerate(csv_reader):
            process_batch(notes_batch, i, tokenizer, notes_tokenized_file)
Example #2
    def load(cls, path, fields, tokenizer_lang, tokenizer_dir, use_gpu, verbose=True, max_sent_length=math.inf):
        tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir, use_gpu=use_gpu, verbose=verbose)

        sentences = []
        fields = [field if field is not None else Field(str(i))
                  for i, field in enumerate(fields)]

        with open(path, 'r') as f:
            lines = []
            for line in tokenizer.format(tokenizer.predict(f.read())):
                line = line.strip()
                if not line:
                    if len(lines) > max_sent_length:
                        logger.info('Discarded sentence longer than max_sent_length: %d',
                                    len(lines))
                        lines = []
                        continue
                    sentences.append(Sentence(fields, lines))
                    lines = []
                else:
                    if not line.startswith('#'):
                        # append fake columns
                        line = '{}\t{}'.format(line, '\t'.join(['_' for i in range(len(CoNLL._fields) - len(line.split('\t')))]))
                        assert len(CoNLL._fields) == len(line.split('\t')), '{} - {} vs {}'.format(line, len(CoNLL._fields), len(line.split()))
                    lines.append(line)

        return cls(fields, sentences)
Example #3
    def __init__(self,
                 NLG_param_dir,
                 NLG_model_fname,
                 tokenizer,
                 NLU_param_dir=None,
                 NLU_model_fname=None):
        self.tokenizer = Tokenizer(tokenizer, '../tokenizer/e2e.model')
        self.tokenizer_mode = tokenizer
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        saved_data = torch.load(
            NLG_param_dir.rstrip('/') + '/' + NLG_model_fname)
        self.model_NLG = saved_data['model']
        f = open(NLG_param_dir.rstrip('/') + '/dictionary.json',
                 'r',
                 encoding='utf-8')
        self.dictionary = json.load(f)
        f.close()

        # beam-search settings
        self.n_beam = 5

        # NLU
        if (NLU_param_dir is not None) and (NLU_model_fname is not None):
            self.NLU = NLU(NLU_param_dir, NLU_model_fname, tokenizer)
        else:
            self.NLU = None
Example #4
def initialize_tokenizer(vocabulary_path, is_bpe=True):
    """Initialize vocabulary from file.

    We assume the vocabulary is stored one-item-per-line, so a file:
      dog
      cat
    will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
    also return the reversed-vocabulary ["dog", "cat"].

    Args:
      vocabulary_path: path to the file containing the vocabulary.

    Returns:
      a pair: the vocabulary (a dictionary mapping string to integers), and
      the reversed vocabulary (a list, which reverses the vocabulary mapping).

    Raises:
      ValueError: if the provided vocabulary_path does not exist.
    """
    if is_bpe:
        return bpe_tokenizer
    if os.path.isfile(vocabulary_path):
        rev_vocab = []
        with open(vocabulary_path, mode="r") as f:
            rev_vocab.extend(f.readlines())
        rev_vocab = [line.strip() for line in rev_vocab]
        return Tokenizer(_UNK, vocab_list=rev_vocab)
    else:
        raise ValueError("Vocabulary file %s not found.", vocabulary_path)
Example #5
    def test_corpus_load(self):
        tokenizer = Tokenizer(**self.args)
        raw_text_file = '/project/piqasso/Collection/IWPT20/train-dev/UD_Italian-ISDT/it_isdt-ud-dev.txt'

        with open(raw_text_file) as fin:
            for line in tokenizer.format(tokenizer.predict(fin.read())):
                if line and not line.startswith('#'):
                    assert len(line.split('\t')) == 2, line
Example #6
 def get_embedding_fn(self, max_length=12):
     self.max_length = max_length
     self.s = Simplifier('tokenizer/zh_mapping.txt')
     self.t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
         'tokenizer/lg.all.voc',
         max_length
     )
     return self.embedding
Example #7
    def test_corpus_load(self):
        tokenizer = Tokenizer(**self.args)
        sin = io.StringIO(
            "Un corazziere contro Scalfaro. L'attore le disse baciami o torno a riprendermelo."
        )

        for line in tokenizer.format(tokenizer.predict(sin.read())):
            if line and not line.startswith('#'):
                assert len(line.split('\t')) == 10, line
Example #8
    def predict(self,
                data,
                pred=None,
                buckets=8,
                batch_size=5000,
                prob=False,
                **kwargs):
        r"""
        Parses the data and produces a parse tree for each sentence.
        Args:
            data (str or list[list[str]]): input to be parsed: either
                  - a str, that will be tokenized first with the tokenizer for the parser language
                  - a path to a file to be read, either in CoNLL-U format or in plain text if :param text: is supplied.
                  - a list of lists of tokens
            text (str): optional, specifies that the input data is in plain text in the specified language code.
            pred (str or file): a path to a file where to write the parsed input in CoNLL-U fprmat.
            bucket (int): the number of buckets used to group sentences to parallelize matrix computations.
            batch_size (int): group sentences in batches.
            prob (bool): whther to return also probabilities for each arc.
        Return:
            a Dataset containing the parsed sentence trees.
        """
        args = self.args.update(locals())
        init_logger(logger, verbose=args.verbose)

        self.transform.eval()
        if args.prob:
            self.transform.append(Field('probs'))

        if isinstance(data, str) and (not conll_format(data) or args.text):
            self.transform.reader = Tokenizer(args.text,
                                              dir=args.cache_dir,
                                              verbose=args.verbose).reader()

        logger.info("Loading the data")
        dataset = Dataset(self.transform, data)
        dataset.build(args.batch_size, args.buckets)
        logger.info(f"\n{dataset}")

        logger.info("Making predictions on the dataset")
        start = datetime.now()
        preds = self._predict(dataset.loader)
        elapsed = datetime.now() - start

        for name, value in preds.items():
            setattr(dataset, name, value)
        if pred is not None and is_master():
            logger.info(f"Saving predicted results to {pred}")
            self.transform.save(pred, dataset.sentences)
        logger.info(
            f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s"
        )

        return dataset
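A minimal usage sketch for the predict method documented above; the parser object, example sentence, and output path are illustrative assumptions rather than part of the original project:

# Hypothetical sketch: typical calls to the predict() method shown above,
# assuming `parser` is an object exposing it.
def parse_examples(parser):
    # raw text in a given language code; tokenized internally
    dataset = parser.predict('She enjoys playing tennis.', text='en', prob=True)
    # pre-tokenized input, writing the parsed trees to a CoNLL-U file
    parser.predict([['She', 'enjoys', 'playing', 'tennis', '.']], pred='out.conllu')
    return dataset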
Example #9
 def __init__(self, param_dir, model_fname, tokenizer):
     self.tokenizer = Tokenizer(tokenizer, '../tokenizer/e2e.model')
     self.tokenizer_mode = tokenizer
     self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     saved_data = torch.load(param_dir.rstrip('/')+'/'+model_fname)
     self.model = saved_data['model']
     f = open(param_dir.rstrip('/')+'/dictionary.json', 'r', encoding='utf-8')
     self.dictionary = json.load(f)
     f.close()
     # beam-search settings
     self.n_beam = 5
Example #10
 def test_download_resources(self):
     self.assertTrue(not os.path.exists(self.MODEL_DIR))
     tokenizer = Tokenizer(**self.args)
     self.assertTrue(
         os.path.exists(self.args['dir'])
         and not os.path.isfile(self.args['dir']))
     self.assertTrue(
         os.path.exists(os.path.join(self.args['dir'], self.args['lang'])))
     self.assertTrue(
         os.path.exists(
             os.path.join(self.args['dir'], self.args['lang'], 'tokenize')))
Example #11
    def test_download_resources(self):
        tokenizer = Tokenizer(self.args['lang'])

        self.assertTrue(os.path.isdir(self.MODEL_DIR))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.MODEL_DIR, 'tokenizer', self.args['lang'])))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.MODEL_DIR, 'tokenizer', self.args['lang'],
                             'tokenize')))
Example #12
def main():
    dataset = data_loader.load_text_file("data_2.txt")
    tokenizer = Tokenizer()
    separated = tokenizer.tokenize([dataset])
    morfeusz = MorfeuszWrapperLexeme()
    for sentence in separated:
        analysed = morfeusz.analyse([w for w, tag in sentence])
        print(analysed)
        for word, analysis in analysed.items():
            print("{}:".format(word))
            print_analysis(analysis)
        print()
Example #13
def tokenize(path):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()

    idents = []
    for token in tokens:
        if token.kind.name == "IDENTIFIER":
            name = token.spelling.lower()
            name = re.sub("_", "", name)
            idents.append(name)

    return "\n".join(idents)
Example #14
def main():
    text = 'Charakteryzował się on ustawieniem zawodników w kształcie piramidy' \
          ' – bramkarz - 2 obrońców - 3 pomocników - 5 napastników (1-2-3-5).'
    morfeusz = MorfeuszWrapperLexeme()
    tokenizer = Tokenizer()
    text = tokenizer.tokenize([text])
    for sen in text:
        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=False)
        print(analysed)

        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=True)
        print(analysed)
        print()
Example #15
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if self.args.feat in ('char', 'bert'):
            self.WORD, self.FEAT = self.transform.FORM
        else:
            self.WORD, self.FEAT = self.transform.FORM, self.transform.CPOS
        self.ARC, self.REL = self.transform.HEAD, self.transform.DEPREL
        self.puncts = torch.tensor([
            i for s, i in self.WORD.vocab.stoi.items() if ispunct(s)
        ]).to(self.args.device)
        if getattr(self.args, 'text', None):
            self.transform.reader = Tokenizer(self.args.text,
                                              self.args.cache_dir).reader()
Example #16
def test_ner(crf, test_sent):
    from tokenizer.tokenizer import Tokenizer
    token = Tokenizer()
    token.run()
    arr_featurized_sent = []
    postaged_sent = ViPosTagger.postagging(token.predict(test_sent))
    print(postaged_sent)
    test_arr = []
    for i in range(len(postaged_sent[0])):
        test_arr.append((postaged_sent[0][i], postaged_sent[1][i]))
    print(test_arr)
    featurized_sent = sent2features(test_arr)
    arr_featurized_sent.append(featurized_sent)
    predict = crf.predict(arr_featurized_sent)
    return list(zip(test_arr, predict[0]))
Example #17
    def __init__(self, normalized=True, classes=None, stemmed=True):
        if classes is None:
            classes = ["positive", "negative", "notr"]

        self.x = []
        self.y = []
        self.tokenizer = Tokenizer()
        self.stemmer = TurkishStemmer()
        self.word2vec = None

        self.cachefile = "data/data" + ("_normalized" if normalized else "") + ("_stemmed" if stemmed else "") + "_" + ("_".join(classes)) + ".pickle"
        if os.path.isfile(self.cachefile):
            with open(self.cachefile, 'rb') as cache:
                self.x, self.y = pickle.load(cache)
        else:
            for cls in classes:
                self._append_data(cls, normalized, stemmed)

            with open(self.cachefile, 'wb') as cache:
                pickle.dump((self.x, self.y), cache)
Example #18
def first_stats():
    tokenizer = Tokenizer()
    tokenizer.run()
    question_vocabulary = Vocabulary()

    questions = load_questions()
    cc = 0
    for question in questions:
        #print question
        if cc % 10 == 0:
            print "\r%s" % cc,
        cc += 1
        sen = tokenizer.predict(question)
        sen = sen.lower()
        tokens = question_vocabulary.get_sentence_token_ids(sen)
        question_list.append(tokens)
    print "\n Saving..."
    question_vocabulary.save(Q_VOCAB_NAME)
    utils.pickle_save(question_list, "question_tokens.dat")

    print "Done"
Example #19
def tokenize(path):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()

    items = []
    for token in tokens:
        if token.kind.name == "LITERAL":
            text = token.spelling
            cursor_kind = clang.cindex.CursorKind
            kind = token.cursor.kind

            if kind == cursor_kind.STRING_LITERAL:
                # do extra processing on strings
                text = sha256(mangle_text(token.spelling)).hexdigest()[:10]

            items.append(text)

        if token.kind.name == "COMMENT":
            hashed = sha256(mangle_text(token.spelling[2:])).hexdigest()[:10]
            items.append(hashed)

    return "\n".join(items)
Example #20
    def load(cls,
             path,
             fields,
             tokenizer_lang,
             tokenizer_dir,
             verbose=True,
             max_sent_length=math.inf):
        tokenizer = Tokenizer(lang=tokenizer_lang,
                              dir=tokenizer_dir,
                              verbose=verbose)

        sentences = []
        fields = [
            field if field is not None else Field(str(i))
            for i, field in enumerate(fields)
        ]

        with open(path, 'r') as f:
            lines = []
            for line in tokenizer.format(tokenizer.predict(f.read())):
                line = line.strip()
                if not line:
                    if len(lines) > max_sent_length:
                        logger.info(
                            'Discarded sentence longer than max_sent_length: %d',
                            len(lines))
                        lines = []
                        continue
                    sentences.append(Sentence(fields, lines))
                    lines = []
                else:
                    if not line.startswith('#'):
                        # append empty columns
                        line += '\t_' * (len(CoNLL._fields) -
                                         len(line.split('\t')))
                    lines.append(line)

        return cls(fields, sentences)
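To illustrate the column padding in the loop above (a self-contained sketch, assuming the 10 standard CoNLL-U fields rather than the project's CoNLL._fields):

# Sketch only: a two-column tokenizer line is padded with '_' placeholders
# until it has 10 tab-separated CoNLL-U columns.
line = '1\tUn'
line += '\t_' * (10 - len(line.split('\t')))
assert len(line.split('\t')) == 10   # '1\tUn\t_\t_\t_\t_\t_\t_\t_\t_'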
Example #21
    def predict(self,
                X,
                part_of_speech=None,
                tagger_preprocessed=False,
                sentence_level=False):

        i = 0
        if sentence_level:
            results = []
            for text in X:
                tokenizer = Tokenizer()
                sentences = tokenizer.tokenize([text])
                sentences = [
                    " ".join([token[0] for token in sentence])
                    for sentence in sentences
                ]

                preprocessed_sentences = self.preprocess_texts(
                    sentences,
                    part_of_speech=part_of_speech,
                    tagger_preprocessed=tagger_preprocessed)

                features = self.vectorizer.transform(preprocessed_sentences).toarray()
                pred = self.nb_model.predict(features)
                results.append(int(round(np.mean(pred))))
                print(i)
                i += 1
            return np.array(results)

        else:
            preprocessed = self.preprocess_texts(
                X,
                part_of_speech=part_of_speech,
                tagger_preprocessed=tagger_preprocessed)
            X = self.vectorizer.transform(preprocessed).toarray()
            return self.nb_model.predict(X)
Example #22
    print(' tokenizer algorithm     : '+str(args.tokenizer))
    if args.v is True:
        print(' verbose (print debug)   : ON')

    # output directory
    if not os.path.exists(args.p):
        os.mkdir(args.p)

    # (0) torch settings
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # (1) tokenizer setting
    tokenizer = Tokenizer(args.tokenizer, '../tokenizer/e2e.model')

    # (2) corpus data
    random.seed(args.seed)
    lex_flag = True
    dataset_train = MyDataset(args.corpus.rstrip('/')+'/e2e_train.json',
                              args.corpus.rstrip('/')+'/e2e_valid.json',
                              args.corpus.rstrip('/')+'/e2e_test.json',
                              args.corpus.rstrip('/')+'/e2e-augment/e2e_mr_lex_max_num_token.json',
                              'train', tokenizer, lex_flag, device)
    dataset_valid = MyDataset(args.corpus.rstrip('/')+'/e2e_train.json',
                              args.corpus.rstrip('/')+'/e2e_valid.json',
                              args.corpus.rstrip('/')+'/e2e_test.json',
                              args.corpus.rstrip('/')+'/e2e-augment/e2e_mr_lex_max_num_token.json',
                              'valid', tokenizer, lex_flag, device)
    dataset_test = MyDataset(args.corpus.rstrip('/')+'/e2e_train.json',
                             args.corpus.rstrip('/')+'/e2e_valid.json',
                             args.corpus.rstrip('/')+'/e2e_test.json',
                             args.corpus.rstrip('/')+'/e2e-augment/e2e_mr_lex_max_num_token.json',
                             'test', tokenizer, lex_flag, device)
Example #23
# -*- encoding: utf8 -*-
import re
import requests
import unicodedata
from tokenizer.tokenizer import Tokenizer
from sklearn.externals import joblib
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from pyvi.pyvi import ViTokenizer
from sklearn.metrics import confusion_matrix

tokenizer = Tokenizer()
tokenizer.run()


def load_model(model):
    print('loading model ...', model)
    if os.path.isfile(model):
        return joblib.load(model)
    else:
        return None


def list_words(mes):
    words = mes.lower().split()
    return " ".join(words)

Example #24
def main():
    dataset = data_loader.load_text_file("data_2.txt")
    tokenizer = Tokenizer()
    output = tokenizer.tokenize([dataset])
    for sentence in output:
        print(sentence)
Example #25
tf.import_graph_def(restored_graph_def,
                    input_map=None,
                    return_elements=None,
                    name="")

graph = tf.get_default_graph()
doc_ids = graph.get_tensor_by_name('doc_ids:0')
doc_mask = graph.get_tensor_by_name('doc_mask:0')
doc_type = graph.get_tensor_by_name('doc_type:0')
#content = graph.get_tensor_by_name('content:0')
doc_output = graph.get_tensor_by_name('doc/output:0')

doc_max_length = 12

s = Simplifier('tokenizer/zh_mapping.txt')
t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
              'tokenizer/lg.all.voc', doc_max_length)
count = 0
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
#f = ['a real test', 'b false test']
with open(infile, 'r', encoding='utf-8') as f, open(outfile,
                                                    'w',
                                                    encoding='utf-8') as fo:
    with tf.Session(config=config) as sess:
        time = datetime.datetime.now()
        for line in f:
            simple = s.simplify(line)
            tokens = t.tokenize(simple)
            accents = run_strip_accents(tokens)
            ids = t.token_to_id(accents)
Example #26
 def test_tokenize(self):
     tokenizer = Tokenizer(self.args['lang'])
     sentences = tokenizer.predict(
         'Ha chiamato il dr. Rossi.Vuole salutarti.')
     self.assertEqual(len(sentences), 2)
Example #27
 def test_tokenize(self):
     tokenizer = Tokenizer(**self.args)
     sentences = tokenizer.predict(
         'Domani vorrei andare al mare.Speriamo faccia bel tempo.')
     self.assertEqual(len(sentences), 2)
Example #28
def genStats(path, helpers):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()

    # stats
    numLines = 0
    numWhitespace = 0
    numComments = 0
    avgIdentLength = 0
    numFunctions = 0  # ident followed by (, declarations and calls
    numDefines = 0
    numMathOps = 0
    lenLongestLine = 0
    numReturns = 0

    # other data
    idents = []
    text = io.readFile(path)
    lastWasIdent = False

    # get info from tokens
    for token in tokens:
        # look for a comment
        if token.kind.name == "COMMENT":
            numComments += 1

        # look for math ops
        if token.spelling in [
                "+", "-", "*", "/", "|", "&", "+=", "-=", "*=", "/=", ">>=",
                "<<=", "++", "--", "~", ">>", "!"
        ]:
            numMathOps += 1

        # look for function decs/calls
        if lastWasIdent and token.spelling == "(":
            numFunctions += 1

        # count the number of returns
        if token.spelling == "return":
            numReturns += 1

        # add the identifier to the list, set lastWasIdent
        if token.kind.name == "IDENTIFIER":
            idents.append(token.spelling)
            lastWasIdent = True
        else:
            lastWasIdent = False

    # get average ident length
    total = 0.0
    for ident in idents:
        total += float(len(ident))
    avgIdentLength = 0.0
    if len(idents) > 0:
        avgIdentLength = total / float(len(idents))

    # find the number of defines
    defines = re.findall(r"#\s*define ", text.lower())
    numDefines = len(defines)

    # find the number of lines
    lines = text.split("\n")
    if len(lines) == 1:
        # ugh, windows
        lines = text.split("\r")
    numLines = len(lines)

    # get the length of the longest line
    for line in lines:
        if len(line) > lenLongestLine:
            lenLongestLine = len(line)

    # find the total amount of whitespace
    for char in text:
        if char in [" ", "\n", "\t", "\r"]:
            numWhitespace += 1

    # create a dict of results and return
    results = {}
    results["numLines"] = numLines
    results["numWhitespace"] = numWhitespace
    results["numComments"] = numComments
    results["avgIdentLength"] = avgIdentLength
    results["numFunctions"] = numFunctions
    results["numDefines"] = numDefines
    results["numMathOps"] = numMathOps
    results["numReturns"] = numReturns
    results["lenLongestLine"] = lenLongestLine
    return results
Example #29
    print('startword: ' + startword)

    print('** output **')
    # MR(LEX) -> TXT_LEX
    nlg_txt, attention_txt = NLG_model.convert_nlg(mr_obj, args.search,
                                                   lex_flag, startword)
    obj_txt = {'txt': nlg_txt}
    print('TXT: ' + obj_txt['txt'])

    #print(obj_txt)
    f = open(args.o, 'w', encoding='utf-8')
    json.dump(obj_txt, f, ensure_ascii=False, indent=4, sort_keys=False)
    f.close()
    print('** done **')

    tokenizer = Tokenizer('nltk', '../../tokenizer/e2e.model')

    txt = nlg_txt
    mr = mr_obj
    if read_mr_obj['name'] != '':
        txt = txt.replace(read_mr_obj['name'], 'NAME')
        mr['name'] = 'NAME'
    if read_mr_obj['near'] != '':
        txt = txt.replace(read_mr_obj['near'], 'NEAR')
        mr['near'] = 'NEAR'

    mr_token = tokenizer.mr(mr)
    txt_token = tokenizer.txt(txt)
    '''
    print(mr_token)
    print(str(len(mr_token)))
Example #30
import pickle
from bm25 import BM25

# Special vocabulary symbols - we always put them at the start.
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

default_tokenizer = Tokenizer(_UNK)
bpe_tokenizer = BPETokenizer(
    open("/home/martin/projects/subword-nmt/vocab_bpe_merged"), _START_VOCAB)


def create_vocabulary(vocabulary_path,
                      data_path,
                      max_vocabulary_size,
                      dataset_reader,
                      tokenizer=default_tokenizer,
                      persist_counts=False):
    """Create vocabulary file (if it does not exist yet) from data file.

    Data file is assumed to contain one sentence per line. Each sentence is
    tokenized and digits are normalized (if normalize_digits is set).
    Vocabulary contains the most-frequent tokens up to max_vocabulary_size.