Example #1
def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                ojob = None
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                    feats = []
                    # TODO Test times with predict one-by-one and this impl
                    for i in filein:
                        features = feature_extract(i, source_tokenizer, target_tokenizer, args)
                        feats.append([float(v) for v in features])

                    if len(feats) > 0:
                        prediction = args.clf.predict_proba(np.array(feats))

                        row = 0
                        for pred in prediction:
                            fileout.write("{}\n".format(str(pred[1])))
                            row += 1
                    
                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                if ojob:
                    output_queue.put(ojob)
                    
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
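
A minimal sketch of how a worker like the one above might be wired up with multiprocessing queues (classifier_process and the populated args namespace are taken from this example; the list of block files is hypothetical):

import multiprocessing

def run_classifier_pool(args, block_files, n_workers=2):
    jobs_queue = multiprocessing.Queue()
    output_queue = multiprocessing.Queue()
    workers = [
        multiprocessing.Process(target=classifier_process,
                                args=(i, jobs_queue, output_queue, args))
        for i in range(n_workers)
    ]
    for w in workers:
        w.start()
    # one job per input block: (block number, path of the block file)
    for nblock, path in enumerate(block_files):
        jobs_queue.put((nblock, path))
    # a falsy sentinel per worker makes the while-loop above exit
    for _ in workers:
        jobs_queue.put(None)
    # collect one (nblock, temporary file name) pair per block, then join
    results = sorted(output_queue.get() for _ in block_files)
    for w in workers:
        w.join()
    return results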
Example #2
def print_unrolled_stats(unrolled_data):
    counter = dict()
    sentiment_counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()

    aspects = set()
    for x in unrolled_data:
        aspects.add(x['aspect'])
    for a in aspects:
        counter[a] = defaultdict(int)
    for e in unrolled_data:
        counter[e['aspect']][e['sentiment']] += 1
        length_list.append(len(tk.tokenize((e['sentence']))))
    for aspect in sorted(counter.keys()):
        total = 0
        for sentiment in sorted(counter[aspect].keys()):
            print('# {}\t\t{}:\t{}'.format(aspect, sentiment,
                                           counter[aspect][sentiment]))
            total += counter[aspect][sentiment]
            sentiment_counter[sentiment] += counter[aspect][sentiment]
        counter[aspect]['total'] = total
        print('# {}\t\t{}:\t{}'.format(aspect, 'total', total))
        print()
    print(sentiment_counter)
    return counter
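
A small usage sketch for the statistics helper above (print_unrolled_stats and its imports come from the example's module; the sample records are made up):

unrolled_data = [
    {'aspect': 'food', 'sentiment': 'positive', 'sentence': 'The soup was great.'},
    {'aspect': 'food', 'sentiment': 'negative', 'sentence': 'The bread was stale.'},
    {'aspect': 'service', 'sentiment': 'positive', 'sentence': 'Friendly staff.'},
]
counts = print_unrolled_stats(unrolled_data)
# e.g. counts['food'] -> {'positive': 1, 'negative': 1, 'total': 2}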
Example #3
def worker_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as tokl, \
         MosesTokenizer(args.target_lang) as tokr:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {}".format(job.__repr__()))
                nblock, filein_name, label = job

                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False) as fileout:
                    logging.debug("Filtering: creating temporary file {}".format(fileout.name))
                    for i in filein:
                        features = feature_extract(i, tokl, tokr, args)
                        
                        for j in features:
                            fileout.write("{}".format(j))
                            fileout.write("\t")
                        fileout.write("{}".format(label))
                        fileout.write("\n")
                    ojob = (nblock, fileout.name)
                    fileout.close()
                    filein.close()
                    output_queue.put(ojob)
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
Example #4
def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(
            args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                with open(filein_name, 'r') as filein, NamedTemporaryFile(
                        mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug(
                        "Classification: creating temporary filename {0}".
                        format(fileout.name))
                    feats = []
                    temp_lines = []
                    # TODO Test times with predict one-by-one and this impl
                    for i in filein:
                        parts = i.strip().split("\t")
                        line = ""
                        temp_lines.append(i)
                        if len(parts) == 7:
                            # Last two columns are the language pair
                            if parts[-2] == args.source_lang and parts[
                                    -1] == args.target_lang:
                                line = "{}\t{}\n".format(parts[1], parts[3])
                            # reversed language pair
                            elif parts[-1] == args.source_lang and parts[
                                    -2] == args.target_lang:
                                line = "{}\t{}\n".format(parts[3], parts[1])
                            features = feature_extract(line, source_tokenizer,
                                                       target_tokenizer, args)
                            feats.append([float(v) for v in features])
                        else:
                            logging.debug(
                                "Line not included in process: {}".format(i))

                    if len(feats) > 0:
                        prediction = args.clf.predict_proba(np.array(feats))

                        row = 0
                        for pred in prediction:
                            while not temp_lines[row].startswith("<tu "):
                                fileout.write(temp_lines[row])
                                row += 1
                            fileout.write("{}\t{}\n".format(
                                temp_lines[row].strip("\n"), str(pred[1])))
                            row += 1
                    else:
                        for l in temp_lines:
                            fileout.write(l)

                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                    output_queue.put(ojob)

                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
Example #5
    def __init__(self):
        from mosestokenizer import MosesTokenizer

        self.tokenizer = MosesTokenizer('ru')
        # disable
        self.tokenizer.argv.append('-no-escape')  # " -> &quot;
        self.tokenizer.argv.remove('-a')  # - -> @-@
        self.tokenizer.restart()
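
The same flag-tweaking pattern as a standalone sketch (assuming the mosestokenizer package; the sample sentence is arbitrary):

from mosestokenizer import MosesTokenizer

tokenizer = MosesTokenizer('ru')
tokenizer.argv.append('-no-escape')   # keep " instead of &quot;
tokenizer.argv.remove('-a')           # keep - instead of @-@
tokenizer.restart()                   # relaunch the underlying process with the new flags
tokens = tokenizer('Пример текста - с дефисом.')
tokenizer.close()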
Example #6
def own_bleu_score(predictions, references, max_order=4, smooth=False):
    '''
    reference_corpus = []
    prediction_corpus = []
    for instance_id, reference_sents in references.items():
        try:
            prediction_sent = predictions[instance_id]
        except KeyError:
            logging.error("Missing prediction for instance '%s'.", instance_id)
            sys.exit(EXIT_STATUS_PREDICTION_MISSING)

        del predictions[instance_id]

        prediction_corpus.append(prediction_sent)
        reference_corpus.append(reference_sents)

    if len(predictions) > 0:
        logging.error("Found %d extra predictions, for example: %s", len(predictions),
                      ", ".join(list(predictions.keys())[:3]))
        sys.exit(EXIT_STATUS_PREDICTIONS_EXTRA)

    reference_length = 0
    translation_length = 0
    scores = []
    counter = 0
    for (references, translation) in zip(reference_corpus, prediction_corpus):
        if counter <= 4:
            print("Referenz: ", references, "\nPrediction: ", translation, "\n")
        counter += 1
        scores.append(sentence_bleu(references, translation, weights=(0,0,0,1)))
    '''
    # to be able to load punkt tokenizer from local folder even if on cluster
    original_dir = os.getcwd()
    execution_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(execution_dir)
    '''
    compl_ref = ""
    for ref in references:
        compl_ref += ref + " "
    
    references = nltk.word_tokenize(compl_ref)
    '''

    #predictions = nltk.word_tokenize(predictions[0].strip('.'))
    tokenizer = MosesTokenizer('en')
    predictions = tokenizer.tokenize(predictions[0].lower())
    references = [
        tokenizer.tokenize(reference.lower()) for reference in references
    ]
    # change directory back after nltk tokenizers have been applied
    os.chdir(original_dir)
    # original bleu score uses constant weights
    #print(references[0])
    #scores = corpus_bleu([references], [predictions])
    scores = sentence_bleu(references, predictions, weights=(0.33, 0.33, 0.33))
    return scores
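
A usage sketch for the scorer above (following the example, predictions is a list whose first element is the hypothesis string and references is a list of reference strings; the sentences are made up):

score = own_bleu_score(
    predictions=["the cat sat on the mat"],
    references=["a cat was sitting on the mat",
                "there is a cat on the mat"])
print(score)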
Example #7
def read_sentence14_target(file_path, max_offset_len=83):
    tk = MosesTokenizer()
    with open(file_path, 'rb') as fopen:
        raw = fopen.read()
        root = etree.fromstring(raw)
        for sentence in root:
            example = dict()
            example["sentence"] = sentence.find('text').text.lower()

            # for RAN
            tokens = tk.tokenize(example['sentence'])

            terms = sentence.find('aspectTerms')
            if terms is None:
                continue
            example["aspect_sentiment"] = []
            example["left_right"] = []
            example['offset'] = []

            for c in terms:
                target = c.attrib['term'].lower()
                example["aspect_sentiment"].append(
                    (target, c.attrib['polarity']))

                # for td lstm
                left_index = int(c.attrib['from'])
                right_index = int(c.attrib['to'])
                example["left_right"].append(
                    (example['sentence'][:right_index],
                     example['sentence'][left_index:], c.attrib['polarity']))

                # for RAN
                left_word_offset = len(
                    tk.tokenize(example['sentence'][:left_index]))
                right_word_offset = len(
                    tk.tokenize(example['sentence'][right_index:]))
                token_index = list(range(len(tokens)))
                token_length = float(len(token_index))
                for i in range(len(tokens)):
                    if i < left_word_offset:
                        token_index[i] = 1 - (left_word_offset -
                                              token_index[i]) / token_length
                    elif i >= right_word_offset:
                        token_index[i] = 1 - (token_index[i] -
                                              (len(tokens) - right_word_offset)
                                              + 1) / token_length
                    else:
                        token_index[i] = 0
                token_index += [-1.] * (max_offset_len - len(tokens))
                example['offset'].append(
                    (token_index, target, c.attrib['polarity']))
            yield example
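
Usage sketch for the generator above (the path is hypothetical and stands for a SemEval-2014 ABSA XML file with <sentence> and <aspectTerms> elements, which is what the parsing code expects):

for example in read_sentence14_target("data/restaurants_train.xml"):
    print(example['sentence'])
    print(example['aspect_sentiment'])   # [(aspect term, polarity), ...]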
Example #8
def moses_tokenize(text):
    from mosestokenizer import MosesTokenizer

    global MOSES_TOK
    if not MOSES_TOK:
        MOSES_TOK = MosesTokenizer('ru')
        # disable
        MOSES_TOK.argv.append('-no-escape')  # " -> &quot;
        MOSES_TOK.argv.remove('-a')  # - -> @-@
        MOSES_TOK.restart()

    chunks = MOSES_TOK(text)
    return find_substrings(chunks, text)
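
A brief usage sketch (it assumes the surrounding module defines MOSES_TOK = None and a find_substrings helper that maps tokens back to character spans, as in this example):

spans = moses_tokenize('Пример текста - с дефисом.')
spans = moses_tokenize('Второе предложение.')   # reuses the cached MOSES_TOK process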
Example #9
class WordTokenizer(BaseTokenizer):
    def __init__(self):
        self.tokenizer = MosesTokenizer()

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer(text.strip())

    def detokenize(self, tokens: List[str]) -> str:
        text = " ".join(tokens).strip()
        return text

    def close(self):
        self.tokenizer.close()
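
A brief usage sketch for the wrapper above (BaseTokenizer and the MosesTokenizer import are assumed to come from the surrounding module):

tok = WordTokenizer()
tokens = tok.tokenize("Hello, world!")   # e.g. ['Hello', ',', 'world', '!']
text = tok.detokenize(tokens)            # naive join: "Hello , world !"
tok.close()                              # shuts down the underlying Moses process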
Example #10
def print_unrolled_stats_atsa(unrolled_data):
    counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()

    for e in unrolled_data:
        counter[e['sentiment']] += 1
        length_list.append(len(tk.tokenize((e['sentence']))))

    for sentiment in sorted(counter.keys()):
        print('#{}:\t{}'.format(sentiment, counter[sentiment]))

    return counter
Example #11
class MosesTokenizer:
    label = 'mosestokenizer'

    def __init__(self):
        from mosestokenizer import MosesTokenizer

        self.tokenizer = MosesTokenizer('ru')
        # disable
        self.tokenizer.argv.append('-no-escape')  # " -> &quot;
        self.tokenizer.argv.remove('-a')  # - -> @-@
        self.tokenizer.restart()

    def __call__(self, text):
        chunks = self.tokenizer(text)
        return find_substrings(chunks, text)
Example #12
    def __init__(self,
                 server,
                 servable_name,
                 t2t_usr_dir,
                 problem,
                 data_dir,
                 timeout_secs):
        super(EnZhNmtClient, self).__init__()
        tf.logging.set_verbosity(tf.logging.INFO)
        validate_flags(server, servable_name)
        usr_dir.import_usr_dir(t2t_usr_dir)
        self.problem = registry.problem(problem)
        self.hparams = tf.contrib.training.HParams(
            data_dir=os.path.expanduser(data_dir))
        self.problem.get_hparams(self.hparams)
        self.request_fn = make_request_fn(server, servable_name, timeout_secs)
        self.moses_tokenizer = MosesTokenizer('en')
        self.moses_detokenizer = MosesDetokenizer('zh')
        if problem.endswith("_rev"):
            fname = "targets"
        else:
            fname = "inputs" if self.problem.has_inputs else "targets"
        self.input_encoder = self.problem.feature_info[fname].encoder

        if problem.endswith("_rev"):
            self.output_decoder = self.problem.feature_info["inputs"].encoder
        else:
            self.output_decoder = self.problem.feature_info["targets"].encoder
Example #13
def kazakh_lemma_tokenizer(sent):
    klt = KazakhLemmatizer()
    tokens = []
    with MosesTokenizer('kk') as tokenize:
        for token in tokenize(sent):
            tokens.append(klt.lemmatize(token))
    return tokens
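
The example above starts a new Moses process and a new lemmatizer on every call; a sketch of a variant that builds them once and reuses them (the factory name is made up, KazakhLemmatizer comes from the example's module, and the Moses process is left open for reuse):

def make_kazakh_lemma_tokenizer():
    klt = KazakhLemmatizer()
    tokenize = MosesTokenizer('kk')
    def lemma_tokenize(sent):
        return [klt.lemmatize(token) for token in tokenize(sent)]
    return lemma_tokenize

lemma_tokenize = make_kazakh_lemma_tokenizer()
tokens = lemma_tokenize("Мен кітап оқыдым.")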
Example #14
def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                ojob = None
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                    feats = []

                    for i in filein:
                        parts = i.split("\t")
                        if len(parts) >= 2 and len(parts[0].strip()) != 0 and len(parts[1].strip()) != 0:
                            features = feature_extract(i, source_tokenizer, target_tokenizer, args)
                            # print("SENTENCE PAIR: %%{}%%".format(i))
                            # print(Features(features)) # debug
                            feats.append([float(v) for v in features])

                    predictions = args.clf.predict_proba(np.array(feats)) if len(feats) > 0 else []
                    filein.seek(0)

                    piter = iter(predictions)
                    for i in filein:
                        parts = i.split("\t")
                        if len(parts) >= 2 and len(parts[0].strip()) != 0 and len(parts[1].strip()) != 0:
                            p = next(piter)
                            fileout.write(i.strip())
                            fileout.write("\t")
                            fileout.write(str(p[1]))
                            fileout.write("\n")
                        else:
                            fileout.write(i.strip("\n"))
                            fileout.write("\t0\n")

                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                 
                if ojob:                    
                    output_queue.put(ojob)
                    
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
Example #15
    def __init__(self,
                 language_code='pt',
                 nltk_stop_words_package='portuguese'):
        self.tokenize = MosesTokenizer(language_code)
        nltk.download('wordnet', quiet=False)
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        nltk.download(info_or_id='stopwords', quiet=False)
        self.stop_words = nltk.corpus.stopwords.words(nltk_stop_words_package)
Example #16
def worker_process(i, jobs_queue, output_queue, args):
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)
    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {}".format(job.__repr__()))
            nblock, filein_name, label = job

            with open(filein_name, 'r') as filein, NamedTemporaryFile(
                    mode="w", delete=False) as fileout:
                logging.debug("Filtering: creating temporary file {}".format(
                    fileout.name))
                for i in filein:
                    srcsen, trgsen = i.split("\t")[:2]
                    trgsen = trgsen.strip()
                    #                    print(str(srcsen) + " --- " + str(trgsen))
                    features = feature_extract(srcsen, trgsen,
                                               source_tokeniser,
                                               target_tokeniser, args)

                    for j in features:
                        fileout.write("{}".format(j))
                        fileout.write("\t")
                    fileout.write("{}".format(label))
                    fileout.write("\n")
                ojob = (nblock, fileout.name)
                fileout.close()
                filein.close()
                output_queue.put(ojob)
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            source_tokeniser.close()
            target_tokeniser.close()
            break
Example #17
    def __init__(self, user_dict, src_vacob_model, tgt_vocab_model, server,
                 servable_name, timeout_secs):
        tf.logging.set_verbosity(tf.logging.INFO)
        self.src_encoder = SpmTextEncoder(src_vacob_model)
        self.tgt_encoder = SpmTextEncoder(tgt_vocab_model)

        self.en_tokenizer = MosesTokenizer('en')
        jieba.load_userdict(user_dict)
        self.request_fn = make_request_fn(server, servable_name, timeout_secs)
        super(EnZhBertAlignClient,
              self).__init__(src_vacob_model, tgt_vocab_model, server,
                             servable_name, timeout_secs)
Example #18
    def __init__(self, src_vacob_model, tgt_vocab_model, server, servable_name,
                 timeout_secs):
        tf.logging.set_verbosity(tf.logging.INFO)
        self.src_encoder = SpmTextEncoder(src_vacob_model)
        self.tgt_encoder = SpmTextEncoder(tgt_vocab_model)

        self.en_tokenizer = MosesTokenizer('en')
        self.mecab = MeCab.Tagger("-Owakati")
        self.request_fn = make_request_fn(server, servable_name, timeout_secs)
        super(EnJaBertAlignClient,
              self).__init__(src_vacob_model, tgt_vocab_model, server,
                             servable_name, timeout_secs)
Example #19
    def __init__(self):
        tf.logging.set_verbosity(tf.logging.INFO)
        validate_flags()
        usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
        self.problem = registry.problem(FLAGS.problem)
        self.hparams = tf.contrib.training.HParams(
            data_dir=os.path.expanduser(FLAGS.data_dir))
        self.problem.get_hparams(self.hparams)
        self.request_fn = make_request_fn()
        self.tokenizer = MosesTokenizer('en')
        self.moses_detokenizer = MosesDetokenizer('zh')
        self.delimiter = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s")
Example #20
def preprocess(inputs):
    sentences, max_tokens, lang = inputs
    tokenizer = MosesTokenizer(lang)

    result = []
    for sent in sentences:
        words = tokenizer(sent.strip())
        if len(words) > max_tokens:
            continue
        else:
            result.append(" ".join(words) + "\n")

    return result
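
The worker above takes a single (sentences, max_tokens, lang) tuple, which makes it easy to fan out over a process pool; a minimal sketch with made-up input:

from multiprocessing import Pool

batch = (["This is a short sentence.", "Another one."], 50, "en")
kept = preprocess(batch)                 # tokenized lines, each ending in "\n"

with Pool(2) as pool:                    # or map many batches in parallel
    results = pool.map(preprocess, [batch, batch])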
Example #21
def preproc_europarl(args):
    """
    - tokenization
    - lower case
    - sub digit with 0
    - remove all punctuations
    - remove redundant spaces and emtpy lines
    - (optional) cut long sentences to a reasonable length
  """
    langs = args.input[args.input.rfind('.') + 1:].strip().split('-')
    # only 2 languages
    assert (len(langs) == 2)
    lang1, lang2 = langs
    tokenizer1 = MosesTokenizer(lang1)
    tokenizer2 = MosesTokenizer(lang2)
    # read corpus
    with open(args.input + '.{}'.format(lang1), 'r') as fin1, \
         open(args.input + '.{}'.format(lang2), 'r') as fin2:
        text1 = fin1.readlines()
        text2 = fin2.readlines()
    assert (len(text1) == len(text2))

    with open(args.input + '.{}.preproc'.format(lang1), 'w') as fout1, \
         open(args.input + '.{}.preproc'.format(lang2), 'w') as fout2:
        for i, line1 in tqdm(enumerate(text1), total=len(text1)):
            # each line is a sentence
            line1 = line1.strip()
            line2 = text2[i].strip()

            line1 = preproc_text(line1, tokenizer1)
            line2 = preproc_text(line2, tokenizer2)

            # remove empty lines
            if not line1 or not line2:
                continue

            fout1.write(line1 + '\n')
            fout2.write(line2 + '\n')
Example #22
def process_corpus(embeddings_dictionary, corpus, vectors, language):
    """
    Cleans corpus using the dictionary of embeddings.
    Any word without an associated embedding in the dictionary is ignored.
    Adds '__target-language' and '__source-language' at the end
    of the words according to their language.
    """
    clean_corpus, clean_vectors, keys = [], {}, []
    words_we_want = set(embeddings_dictionary)
    tokenize = MosesTokenizer(language)
    for key, doc in enumerate(corpus):
        clean_doc = []
        words = tokenize(doc)
        for word in words:
            if word in words_we_want:
                clean_doc.append(word + "__%s" % language)
                clean_vectors[word + "__%s" % language] = np.array(
                    vectors[word].split()).astype(np.float)
        if len(clean_doc) > 3 and len(clean_doc) < 25:
            keys.append(key)
        clean_corpus.append(" ".join(clean_doc))
    tokenize.close()
    return np.array(clean_corpus), clean_vectors, keys
Example #23
def clean_corpus_suffix(corpus, language):
    """
    Adds '__target-language' and '__source-language' at the end of the words
    """
    clean_corpus = []
    tokenize = MosesTokenizer(language)
    for definition in corpus:
        definition = sub(r"'", "", definition)
        definition = sub(r"[^\w]", " ", definition)
        clean_doc = []
        words = tokenize(definition)
        for word in words:
            clean_doc.append(word + "__%s" % language)
        clean_corpus.append(" ".join(clean_doc))
    return clean_corpus
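
Usage sketch for the helper above (the sample definitions are made up; note that, unlike process_corpus, this function never closes the MosesTokenizer it creates):

docs = clean_corpus_suffix(["a short definition", "another one"], "en")
# e.g. ['a__en short__en definition__en', 'another__en one__en']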
Example #24
def get_tokenizer(cmd, lang="en"):
    if cmd == "moses":
        return MosesTokenizer(lang)
    elif cmd == "mecab":
        tagger = MeCab.Tagger("-Owakati")

        def mecab(text):
            return tagger.parse(text).strip().split()
        return mecab
    else:
        proc = ExternalTextProcessor(cmd.split())

        def external(text):
            return proc.process(text).strip().split()
        return external
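
Usage sketch for the factory above (the "moses" branch needs mosestokenizer, the "mecab" branch needs MeCab, and the external command path is hypothetical):

tokenize = get_tokenizer("moses", lang="en")
tokens = tokenize("Hello, world!")

# any other value is treated as an external command that tokenizes stdin
tokenize_ext = get_tokenizer("/path/to/external_tokenizer --some-flag")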
Example #25
class MosesTokenizer(Tokenizer):
    def __init__(self):
        super().__init__()
        self._tokenizer = NLTKMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces escape sequences like &#91; with the original characters
        (such as '['), so they better align to the original text.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]
Example #26
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        from pytorch_pretrained_bert import BertTokenizer

        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name == "OpenAI.BPE":
        tokenizer = OpenAIBPETokenizer()
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
Example #27
    def __init__(self,
                 srclang,
                 targetlang,
                 sourcebpe=None,
                 targetbpe=None,
                 sourcespm=None,
                 targetspm=None):
        self.bpe_source = None
        self.bpe_target = None
        self.sp_processor_source = None
        self.sp_processor_target = None
        self.sentences = []
        # load BPE model for pre-processing
        if sourcebpe:
            # print("load BPE codes from " + sourcebpe, flush=True)
            BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
            self.bpe_source = BPE(BPEcodes)
        if targetbpe:
            # print("load BPE codes from " + targetbpe, flush=True)
            BPEcodes = open(targetbpe, 'r', encoding="utf-8")
            self.bpe_target = BPE(BPEcodes)

        # load SentencePiece model for pre-processing
        if sourcespm:
            # print("load sentence piece model from " + sourcespm, flush=True)
            self.sp_processor_source = sentencepiece.SentencePieceProcessor()
            self.sp_processor_source.Load(sourcespm)
        if targetspm:
            # print("load sentence piece model from " + targetspm, flush=True)
            self.sp_processor_target = sentencepiece.SentencePieceProcessor()
            self.sp_processor_target.Load(targetspm)

        # pre- and post-processing tools
        self.tokenizer = None
        self.detokenizer = None

        # TODO: should we have support for other sentence splitters?
        # print("start pre- and post-processing tools")
        self.sentence_splitter = MosesSentenceSplitter(srclang)
        self.normalizer = MosesPunctuationNormalizer(srclang)
        if self.bpe_source:
            self.tokenizer = MosesTokenizer(srclang)

        if self.bpe_source:
            self.detokenizer = MosesDetokenizer(targetlang)
Example #28
def tokenizer_moses(text, column='comment_text'):  # `column` selects the text field of a CSV-like record
    '''
    A wrapper for the Moses text preprocessing utilities,
    which cannot handle newlines.
        text: string, a record indexed by `column`, or a list of strings (column=None)
        out: list of tokens
    '''
    result = []
    with MosesPunctuationNormalizer() as punct, MosesTokenizer('en') as tok:
        if column:
            texts = list(filter(None, text[column].lower().split('\n')))
        else:
            texts = text
        for t in texts:
            if len(t.strip()):
                norm = punct(t)
                tokens = tok(norm)
                result.extend(tokens)
    return result
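
A usage sketch for the wrapper above (MosesPunctuationNormalizer and MosesTokenizer are assumed to be imported in the surrounding module; the record below stands in for a CSV row):

row = {'comment_text': "First line of a comment.\nSecond line!"}
tokens = tokenizer_moses(row)                           # column defaults to 'comment_text'
tokens = tokenizer_moses(["plain text"], column=None)   # or pass a list of strings directly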
Example #29
    def build_vocabulary(cls, corpus: list=None, file_path: str=None, max_vocab_size=30000, lang='en'):
        vocab = cls(lang=lang, max_vocab_size=max_vocab_size)
        counter = Counter()

        tokenizer = MosesTokenizer(lang=lang)

        if file_path is not None:
            with open(file_path, "rt") as f:
                # TODO: Make preprocessor
                corpus = f.readlines()

        for sentence in tqdm(corpus, desc="Build vocabulary"):
            words = tokenizer(sentence.strip())
            counter.update(words)

        for index, (k, v) in enumerate(counter.most_common(max_vocab_size - len(basic_tokens))):
            vocab.dictionary[index + len(basic_tokens)] = k

        vocab.reversed_dictionary = dict(zip(vocab.dictionary.values(), vocab.dictionary.keys()))
        return vocab
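
Usage sketch for the classmethod above (the class name Vocabulary and its dictionary / reversed_dictionary / basic_tokens attributes are assumptions based on this snippet):

vocab = Vocabulary.build_vocabulary(corpus=["a tiny corpus .", "another line ."],
                                    max_vocab_size=1000, lang="en")
print(len(vocab.dictionary), len(vocab.reversed_dictionary))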
Example #30
    def __init__(self,
                 mosestokenizer_language_code="en",
                 store_data=False,
                 spell_checker_lang=None,
                 n_jobs=1):
        self.mosestokenizer_language_code = mosestokenizer_language_code
        self.splitsents = MosesSentenceSplitter(
            self.mosestokenizer_language_code)
        self.tokenize = MosesTokenizer(self.mosestokenizer_language_code)
        nltk.download('wordnet', quiet=False)
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        self.stop = False
        self.store_data = store_data
        if spell_checker_lang is None:
            logger.info("The spell checker is disabled.")
            self.spell_checker = None
        else:
            logger.info("The spell checker is enabled for %s." %
                        (spell_checker_lang))
            self.spell_checker = SpellChecker(language=spell_checker_lang,
                                              n_jobs=n_jobs)