Code Example #1
File: indic_norm_tok.py  Project: codedecde/flores
import argparse
import fileinput
import os
import sys


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--indic-nlp-path",
                        required=True,
                        help="path to Indic NLP Library root")
    parser.add_argument("--language", required=True)
    parser.add_argument("--remove-nuktas", default=False, action="store_true")
    parser.add_argument("input", help="input file; use - for stdin")
    args = parser.parse_args()

    try:
        sys.path.extend([
            args.indic_nlp_path,
            os.path.join(args.indic_nlp_path, "src"),
        ])
        from indicnlp.tokenize import indic_tokenize
        from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
    except ImportError:
        raise Exception(
            "Cannot load Indic NLP Library, make sure --indic-nlp-path is correct"
        )

    # create normalizer
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer(
        args.language,
        remove_nuktas=args.remove_nuktas,
    )

    # normalize and tokenize
    for line in fileinput.input([args.input],
                                openhook=fileinput.hook_compressed):
        line = normalizer.normalize(line.decode("utf-8", errors="ignore"))
        line = " ".join(indic_tokenize.trivial_tokenize(line, args.language))
        sys.stdout.write(line.encode("utf-8"))
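For a quick check of the same API calls outside the CLI wrapper, here is a minimal sketch, assuming the Indic NLP Library is importable and using Hindi with a placeholder sentence:

from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

normalizer = IndicNormalizerFactory().get_normalizer("hi", remove_nuktas=False)
text = normalizer.normalize("यह एक परीक्षण वाक्य है।")  # placeholder sentence
print(" ".join(indic_tokenize.trivial_tokenize(text, "hi")))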
Code Example #2
def normalize(ip_file_path, op_file_path, ln):
    # create the normalizer once, outside the per-line loop
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer(ln)
    with open(ip_file_path, 'r') as f:
        with open(op_file_path, "w") as text_file:
            for line in f:
                output_text = normalizer.normalize(line)
                text_file.write(output_text)
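A hypothetical call of the function above (file names and language code are placeholders; IndicNormalizerFactory comes from indicnlp.normalize.indic_normalize):

from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

normalize("corpus.hi", "corpus.norm.hi", "hi")  # placeholder file names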
Code Example #3
def extract_exclusive_msr_corpus(c0_dir, c1_dir, c0_lang, c1_lang, outdir ): 

    factory=IndicNormalizerFactory()
    l0_normalizer=factory.get_normalizer(lang_code_mapping[c0_lang])
    l1_normalizer=factory.get_normalizer(lang_code_mapping[c1_lang])

    data_cache=defaultdict(lambda : [set(),set()])

    # read corpus 0
    en0_f=codecs.open(c0_dir+'/train.En','r','utf-8')
    l0_f=codecs.open(c0_dir+'/train.'+c0_lang,'r','utf-8')

    for en_l,c_l in itertools.izip(iter(en0_f),iter(l0_f)): 
        data_cache[en_l.strip()][0].add(l0_normalizer.normalize(c_l.strip()))

    en0_f.close()
    l0_f.close()

    # read corpus 1                
    en1_f=codecs.open(c1_dir+'/train.En','r','utf-8')
    l1_f=codecs.open(c1_dir+'/train.'+c1_lang,'r','utf-8')

    for en_l,c_l in itertools.izip(iter(en1_f),iter(l1_f)): 
        data_cache[en_l.strip()][1].add(l1_normalizer.normalize(c_l.strip()))

    en1_f.close()
    l1_f.close()

    # write the common data

    # from language en to c0 
    xor_f=codecs.open(outdir+'/train.{}-{}'.format('En',c0_lang),'w','utf-8')
    xor_list=[]
    for en_l, other_l_lists in data_cache.iteritems(): 
        if (len(other_l_lists[0]) >0 and len(other_l_lists[1]) == 0): 
            other_l_lists_w=[u''.join(x.split()) for x in other_l_lists[0]]
            xor_list.append(u''.join(en_l.split()) + u'|' + u'^'.join(other_l_lists_w)+u'\n')

    random.shuffle(xor_list)
    for wr in xor_list: 
        xor_f.write(wr)

    xor_f.close()

    # from language en to c1 
    xor_f=codecs.open(outdir+'/train.{}-{}'.format('En',c1_lang),'w','utf-8')
    xor_list=[]
    for en_l, other_l_lists in data_cache.iteritems(): 
        if (len(other_l_lists[0]) ==0 and len(other_l_lists[1]) > 0): 
            other_l_lists_w=[u''.join(x.split()) for x in other_l_lists[1]]
            xor_list.append(u''.join(en_l.split()) + u'|' + u'^'.join(other_l_lists_w)+u'\n')

    random.shuffle(xor_list)
    for wr in xor_list: 
        xor_f.write(wr)

    xor_f.close()
Code Example #4
def pre_process_hindi_sentence(line):
    remove_nuktas = False
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer("hi", remove_nuktas)
    line = normalizer.normalize(line)
    line = clean_text(line)
    tokens = indic_tokenize.trivial_tokenize(line)
    tokens = [word.lower() for word in tokens]
    tokens = [word for word in tokens if not re.search(r'\d', word)]
    return ' '.join(tokens)
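A hypothetical call (the sentence is a placeholder; re, indic_tokenize, IndicNormalizerFactory, and the project's clean_text helper are assumed to be imported elsewhere in the module):

sample = "इस वाक्य में 2 अंक वाले टोकन हैं ।"  # placeholder sentence
print(pre_process_hindi_sentence(sample))  # tokens containing digits are dropped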
Code Example #5
    def normalize_and_tokenize(self, lang, fname):
        factory = IndicNormalizerFactory()
        normalizer = factory.get_normalizer(lang, remove_nuktas=False)
        tokenized_file = fname.replace('/', '_')
        tokenized_file = os.path.join('/tmp', tokenized_file)
        with open(fname) as istream:
            with open(tokenized_file, 'w+') as ostream:
                for line in istream:
                    line = line.strip()
                    line = normalizer.normalize(line)
                    tokens = tokenize(line, lang=self.src_lang)
                    tokenized_line = ' '.join(tokens)
                    print(tokenized_line, file=ostream)
        return tokenized_file
Code Example #6
    def __init__(self, dataset_path, index_path, cache_path):
        '''
        Init class method

        Arguments:
            dataset_path - path to json data
            index_path - path to the embedding index file
            cache_path - directory used for caching
        '''

        self.cache_path = cache_path
        self.lang_map = {
            'HI': 'hi',
            'BE': 'bn',
            'GU': 'gu',
            'OD': 'or',
            'PU': 'pa',
            'EN': 'en',
            'MA': 'mr'
        }
        self.dataset = read_json_file(dataset_path)
        self.index_path = index_path
        print(self.index_path, self.cache_path)
        self.factory = IndicNormalizerFactory()
        self.stemmer = WordNetLemmatizer()
        self.normalizers = self.get_indic_normalizers()
        self.en_stop = set(nltk.corpus.stopwords.words('english'))

        # Dataset params
        self.phrases = list()
        self.targets = list()
        self.src_lang = list()
        self.target_lang = list()
        self.max_seq_length = 128
        self.language_ids = {
            'HI': 0,
            'BE': 1,
            'GU': 2,
            'OD': 3,
            'PU': 4,
            'EN': 5,
            'MA': 6
        }
        self.get_dataset()
Code Example #7

**Create instances of normalizer and tokenizer for English and Marathi**
"""

from mosestokenizer import *
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("mr")
tokenize = MosesTokenizer('en')

"""**Preprocessing functions for** 
> * English: Lowercase + Tokenize
> * Marathi: Normalize + Tokenize

"""

def preprocess_en(text):
    s = text.lower()
    s = ' '.join(tokenize(s))
    return s

def preprocess_mr(text):
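    # The excerpt is truncated here. A plausible body, based on the description
    # above ("Marathi: Normalize + Tokenize"); this is a reconstruction, not the
    # original code.
    s = normalizer.normalize(text)
    s = ' '.join(indic_tokenize.trivial_tokenize(s, 'mr'))
    return s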
Code Example #8
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize.indic_tokenize import trivial_tokenize


def process(lang, sent):
    normalizer = IndicNormalizerFactory().get_normalizer(lang)
    normalized = normalizer.normalize(sent)
    processed = ' '.join(trivial_tokenize(normalized, lang))
    return processed
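A hypothetical call (language code and sentence are placeholders; trivial_tokenize and IndicNormalizerFactory come from indicnlp as imported above):

print(process('hi', 'यह एक परीक्षण वाक्य है।'))  # normalized, space-joined tokens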
Code Example #9
    def __init__(self):
        data_dir = os.path.dirname(__file__) + '/data/'
        self.initial_urdu_to_hindi_map = {}
        self.final_urdu_to_hindi_map = {}
        self.urdu_to_hindi_map_pass1 = {}
        self.urdu_to_hindi_map_pass2 = {}

        for map_file in HINDUSTANI_MISC_MAP_FILES:
            df = pd.read_csv(data_dir + map_file, header=None)
            for i in df.columns:
                urdu_letter, roman_letter, hindi_letter = str(
                    df[i][0]).strip(), str(df[i][1]).strip(), str(
                        df[i][2]).strip()
                self.urdu_to_hindi_map_pass1[urdu_letter] = hindi_letter

        for map_file in INITIAL_HINDUSTANI_MAP_FILES:
            df = pd.read_csv(data_dir + map_file, header=None)
            for i in df.columns:
                urdu_letter, roman_letter, hindi_letter = str(
                    df[i][0]).strip(), str(df[i][1]).strip(), str(
                        df[i][2]).strip()
                self.initial_urdu_to_hindi_map[urdu_letter] = hindi_letter

        for map_file in FINAL_HINDUSTANI_MAP_FILES:
            df = pd.read_csv(data_dir + map_file, header=None)
            for i in df.columns:
                urdu_letter, roman_letter, hindi_letter = str(
                    df[i][0]).strip(), str(df[i][1]).strip(), str(
                        df[i][2]).strip()
                self.final_urdu_to_hindi_map[urdu_letter] = hindi_letter

        for map_file in HINDUSTANI_MAIN_MAP_FILES:
            df = pd.read_csv(data_dir + map_file, header=None)
            for i in df.columns:
                urdu_letter, roman_letter, hindi_letter = str(
                    df[i][0]).strip(), str(df[i][1]).strip(), str(
                        df[i][2]).strip()
                self.urdu_to_hindi_map_pass2[urdu_letter] = hindi_letter
                if 'consonants' not in map_file:
                    continue

                # Non-initial forms: Consonant+ا to Consonant+ा
                self.urdu_to_hindi_map_pass2[urdu_letter +
                                             'ا'] = hindi_letter + 'ा'
                if len(urdu_letter) == 1:
                    urdu_shadda, hindi_shadda = urdu_letter + " ّ".strip(
                    ), hindi_letter + '्' + hindi_letter
                    self.urdu_to_hindi_map_pass1[urdu_shadda] = hindi_shadda
                    self.urdu_to_hindi_map_pass1[urdu_shadda +
                                                 'ا'] = hindi_shadda + 'ा'
                    # Note on why it's not in pass-2: پکّا is converted as पक्कअ instead of पक्का (Regex sees shadda char as word boundary?)

        self.initial_urdu_to_hindi_converter = StringTranslator(
            self.initial_urdu_to_hindi_map, match_initial_only=True)
        self.final_urdu_to_hindi_converter = StringTranslator(
            self.final_urdu_to_hindi_map, match_final_only=True)
        self.urdu_to_hindi_converter_pass1 = StringTranslator(
            self.urdu_to_hindi_map_pass1)
        self.urdu_to_hindi_converter_pass2 = StringTranslator(
            self.urdu_to_hindi_map_pass2)

        # Monkey patch: Force ह to map only to ہ (not ھ)
        self.urdu_to_hindi_converter_pass2.reverse_translation_dict['ह'] = 'ہ'
        self.urdu_to_hindi_converter_pass2.reverse_translation_dict[
            'ह' + 'ा'] = 'ہ' + 'ا'
        self.urdu_to_hindi_converter_pass1.reverse_translation_dict[
            'ह्ह'] = 'ہّ'
        self.urdu_to_hindi_converter_pass1.reverse_translation_dict[
            'ह्ह' + 'ा'] = 'ہّ' + 'ا'

        from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
        self.hindi_normalizer = IndicNormalizerFactory().get_normalizer('hi')
Code Example #10
def extract_common_msr_corpus(c0_dir, c1_dir, c0_lang, c1_lang, outdir ): 

    factory=IndicNormalizerFactory()
    l0_normalizer=factory.get_normalizer(lang_code_mapping[c0_lang])
    l1_normalizer=factory.get_normalizer(lang_code_mapping[c1_lang])

    data_cache=defaultdict(lambda : [set(),set()])

    # read corpus 0
    en0_f=codecs.open(c0_dir+'/train.En','r','utf-8')
    l0_f=codecs.open(c0_dir+'/train.'+c0_lang,'r','utf-8')

    for en_l,c_l in itertools.izip(iter(en0_f),iter(l0_f)): 
        data_cache[en_l.strip()][0].add(l0_normalizer.normalize(c_l.strip()))

    en0_f.close()
    l0_f.close()

    # read corpus 1                
    en1_f=codecs.open(c1_dir+'/train.En','r','utf-8')
    l1_f=codecs.open(c1_dir+'/train.'+c1_lang,'r','utf-8')

    for en_l,c_l in itertools.izip(iter(en1_f),iter(l1_f)): 
        data_cache[en_l.strip()][1].add(l1_normalizer.normalize(c_l.strip()))

    en1_f.close()
    l1_f.close()

    # write the common data

    # from language c0 to c1
    cc0_1_f=codecs.open(outdir+'/train.{}-{}'.format(c0_lang,c1_lang),'w','utf-8')
    cc0_1_xlit_f=codecs.open(outdir+'/train.{}-{}.xlit'.format(c0_lang,c1_lang),'w','utf-8')
    cc0_1_list=[]
    cc0_1_xlit_list=[]
    for en_l, other_l_lists in data_cache.iteritems(): 
        if len(other_l_lists[0]) >0 and len(other_l_lists[1]) >0 : 
            for c0_str in  other_l_lists[0] :  
                c0_str_w=c0_str.replace(u' ',u'')
                other_l_lists_w=[u''.join(x.split()) for x in other_l_lists[1]]
                if len(c0_str_w)>3:
                    cc0_1_list.append(c0_str_w + u'|' + u'^'.join(other_l_lists_w)+u'\n')
                    cc0_1_xlit_list.append(UnicodeIndicTransliterator.transliterate(c0_str_w,lang_code_mapping[c0_lang],'hi') + 
                            u'|' + 
                            u'^'.join([ UnicodeIndicTransliterator.transliterate(x,lang_code_mapping[c1_lang],'hi')  for x in other_l_lists_w])+u'\n')

    combined_list=zip(cc0_1_list,cc0_1_xlit_list)                   
    random.shuffle(combined_list)
    for wr,wr_xlit in combined_list: 
        cc0_1_f.write(wr)
        cc0_1_xlit_f.write(wr_xlit)

    cc0_1_f.close()
    cc0_1_xlit_f.close()

    # from language c1 to c0
    cc1_0_f=codecs.open(outdir+'/train.{}-{}'.format(c1_lang,c0_lang),'w','utf-8')
    cc1_0_xlit_f=codecs.open(outdir+'/train.{}-{}.xlit'.format(c1_lang,c0_lang),'w','utf-8')
    cc1_0_list=[]
    cc1_0_xlit_list=[]
    for en_l, other_l_lists in data_cache.iteritems(): 
        if len(other_l_lists[1]) >0 and len(other_l_lists[0]) >0 : 
            for c1_str in  other_l_lists[1] :  
                c1_str_w=c1_str.replace(u' ',u'')
                other_l_lists_w=[u''.join(x.split()) for x in other_l_lists[0]]
                if len(c1_str_w)>3:
                    cc1_0_list.append(c1_str_w + u'|' + u'^'.join(other_l_lists_w)+u'\n')
                    cc1_0_xlit_list.append(UnicodeIndicTransliterator.transliterate(c1_str_w,lang_code_mapping[c1_lang],'hi') + 
                            u'|' + 
                            u'^'.join([ UnicodeIndicTransliterator.transliterate(x,lang_code_mapping[c0_lang],'hi')  for x in other_l_lists_w])+u'\n')

    combined_list=zip(cc1_0_list,cc1_0_xlit_list)                   
    random.shuffle(combined_list)
    for wr,wr_xlit in combined_list: 
        cc1_0_f.write(wr)
        cc1_0_xlit_f.write(wr_xlit)

    cc1_0_f.close()
    cc1_0_xlit_f.close()
Code Example #11
import pickle
import sys

from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

remove_nuktas = False
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("hi", remove_nuktas)

hin = open('./model/dataset/en-hi.hi', encoding='utf-8').readlines()
print(hin[:5])
hin = [normalizer.normalize(line.strip()) for line in hin]

hin = [indic_tokenize.trivial_tokenize(line) for line in hin]
print(hin[:5])

with open("hindi_tokens.txt", "wb") as fp:
    pickle.dump(hin, fp)
# In[ ]:


for path in sys.path:
    print(path)


# In[ ]:


from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

input_text="\u0958 \u0915\u093c"
remove_nuktas=False
factory=IndicNormalizerFactory()
normalizer=factory.get_normalizer("hi",remove_nuktas)
output_text=normalizer.normalize(input_text)

print(output_text)
print('Length before normalization: {}'.format(len(input_text)))
print('Length after normalization: {}'.format(len(output_text)))


# In[ ]:


from indicnlp.normalize.indic_normalize import DevanagariNormalizer
input_text = "अत : इसे बिना टाँके वाला ऑपरेशन भी कहते हैं ।"
factory1=DevanagariNormalizer()
#normalizer1=factory1.get_normalizer("hi",remove_nuktas)
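The excerpt stops here. DevanagariNormalizer is itself a normalizer (it has no get_normalizer method), so a plausible continuation is simply the following sketch, not the original notebook cell:

output_text1 = factory1.normalize(input_text)
print(output_text1)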
Code Example #13
import sentencepiece
from sacremoses import MosesDetokenizer, MosesTokenizer
import sys, os

sys.path.extend([
    "app/modules/indic_nlp_library/src",
])  # coming all the way from app.py
from indicnlp.tokenize import indic_tokenize, indic_detokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer(
    "ne",
    remove_nuktas=False,
)


def bpencode(sentence, srctolang):
    sp = sentencepiece.SentencePieceProcessor()

    if srctolang == "ne_en":
        sp.Load("app/ne_en_bpe20000/sentencepiece.bpe.model")
    elif srctolang == "en_ne":
        sp.Load("app/en_ne_bpe5000/sentencepiece.bpe.model")

    return " ".join(sp.EncodeAsPieces(sentence))


def detok(sentence, lang):
    if lang == "en":
        return MosesDetokenizer(lang="en").detokenize(sentence.split())
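    # The excerpt is truncated here. A plausible completion for the non-English
    # (Nepali) branch, using indic_detokenize as imported above; this is a
    # reconstruction, not the original code.
    return indic_detokenize.trivial_detokenize(sentence, lang)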
Code Example #14
def get_split_algo(lang: str,
                   split_algo: str) -> tp.Callable[[str], tp.Iterable[str]]:
    # get default algorithm if requested
    if split_algo == "default":
        # use best algorithm in function of language
        if lang in LANGS_MOSES:
            split_algo = "moses"
        elif lang in LANGS_INDIC:
            split_algo = "indic"
        elif lang in LANGS_GEEZ:
            split_algo = "geez"
        elif lang in LANGS_KHMER:
            split_algo = "khmer"
        elif lang in LANGS_BURMESE:
            split_algo = "burmese"
        else:
            # use Moses by default (which likely will fall-back to English)
            split_algo = "moses"
        logger.info(f" - default algorithm for {lang} is {split_algo}")

    if split_algo == "none" or lang == "TODO":
        logger.info(" - no sentence splitting")
        return lambda line: [line]

    elif split_algo == "moses":
        if lang in LANGS_MOSES:
            lang = LANGS_MOSES[lang]
            logger.info(
                f" - Moses sentence splitter: using rules for '{lang}'")
        else:
            logger.info(
                f" - Moses sentence splitter for '{lang}': falling back to English rules"
            )
            lang = "en"
        splitter = SentenceSplitter(language=lang)
        # non_breaking_prefix_file=non_breaking_prefix_file
        return splitter.split

    elif split_algo == "indic":
        # initialize toolkit (apparently not needed for sentence segmentation)
        if INDIC_NLP_RESOURCES:
            logger.info(" - Initialize Indic NLP toolkit")
            indic_common.set_resources_path(INDIC_NLP_RESOURCES)
            indic_loader.load()
        if lang in LANGS_INDIC:
            lang = LANGS_INDIC[lang]
            logger.info(
                f" - Indic sentence splitter: using rules for '{lang}'")
        else:
            logger.info(
                f" - Indic sentence splitter for '{lang}': falling back to Hindi rules"
            )
            lang = "hi"

        # setup normalizer
        factory = IndicNormalizerFactory()
        indic_normalizer = factory.get_normalizer(lang)

        def split_indic(line: str) -> tp.Iterable[str]:
            """Split Indian text into sentences using Indic NLP tool."""
            line = indic_normalizer.normalize(line)
            for sent in indic_sent_tok.sentence_split(line, lang=lang):
                yield sent

        return split_indic

    elif split_algo == "laonlp":
        logger.info(f" - LaoNLP sentence splitter applied to '{lang}'")
        return lao_sent_tok

    elif split_algo == "khmer":
        logger.info(f" - Khmer NLTK sentence splitter applied to '{lang}'")
        return khm_sent_tok

    elif split_algo == "bodnlp":
        logger.info(f" - Tibetan NLTK sentence splitter applied to '{lang}'")
        return bod_sent_tok

    elif split_algo == "geez":
        logger.info(
            f" - Ge'ez rule-based sentence splitter applied to '{lang}'")
        return split_geez

    elif split_algo == "burmese":
        logger.info(
            f" - Burmese rule-based sentence splitter applied to '{lang}'")
        return split_burmese

    else:
        logger.error(f"Unknown splitting algorithm {split_algo}")

    return None
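A hypothetical usage sketch (the language code must be a key of the LANGS_* tables defined elsewhere in this module; 'hi' is assumed here purely for illustration):

splitter = get_split_algo("hi", split_algo="default")  # assumed language code
for sentence in splitter("पहला वाक्य। दूसरा वाक्य।"):
    print(sentence)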
Code Example #15
File: hindi.py  Project: AvijitGhosh82/Tweets_NLP
# -*- coding: utf-8 -*-

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME="/Users/Avijit/Documents/nlp_lib"
# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES="/Users/Avijit/Documents/nlp_res"

from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
loader.load()


from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

input_text=u"\u0958 \u0915\u093c"
remove_nuktas=False
factory=IndicNormalizerFactory()
normalizer=factory.get_normalizer("hi",remove_nuktas)
output_text=normalizer.normalize(input_text)

print output_text
print 'Length before normalization: {}'.format(len(input_text))
print 'Length after normalization: {}'.format(len(output_text))
Code Example #16
class XLingualTrainDataset_baseline_lstm(Dataset):
    '''
    Reverse dictionary data loader for training
    '''
    def __init__(self, dataset_path, index_path, cache_path):
        '''
        Init class method

        Arguments:
            dataset_path - path to json data
            index_path - path to the word-embedding vectors file (loaded via torchtext vocab.Vectors)
            cache_path - cache directory passed to vocab.Vectors
        '''

        self.cache_path = cache_path
        self.lang_map = {
            'HI': 'hi',
            'BE': 'bn',
            'GU': 'gu',
            'OD': 'or',
            'PU': 'pa',
            'EN': 'en',
            'MA': 'mr'
        }
        self.dataset = read_json_file(dataset_path)
        self.index_path = index_path
        print(self.index_path, self.cache_path)
        self.factory = IndicNormalizerFactory()
        self.stemmer = WordNetLemmatizer()
        self.normalizers = self.get_indic_normalizers()
        self.en_stop = set(nltk.corpus.stopwords.words('english'))

        # Dataset params
        self.phrases = list()
        self.targets = list()
        self.src_lang = list()
        self.target_lang = list()
        self.max_seq_length = 128
        self.language_ids = {
            'HI': 0,
            'BE': 1,
            'GU': 2,
            'OD': 3,
            'PU': 4,
            'EN': 5,
            'MA': 6
        }
        self.get_dataset()

    def get_indic_normalizers(self):
        '''
        Get indic nlp normalizers for preprocessing data
        '''
        normalizers = {}
        for lang in self.lang_map:
            if self.lang_map[lang] != "en":
                normalizers[self.lang_map[lang]] = self.factory.get_normalizer(
                    self.lang_map[lang], remove_nuktas=False)

        return normalizers

    def get_dataset(self):

        self.embeddings = vocab.Vectors(name=self.index_path,
                                        cache=self.cache_path)
        self.vocabulary = torchtext.data.Field()

        # Adding pad and unk token
        self.embeddings.stoi[self.vocabulary.pad_token] = len(
            self.embeddings.stoi)
        self.embeddings.vectors[self.embeddings.stoi[
            self.vocabulary.pad_token]] = torch.zeros(300)
        self.embeddings.stoi[self.vocabulary.unk_token] = len(
            self.embeddings.stoi)
        self.embeddings.vectors[self.embeddings.stoi[
            self.vocabulary.unk_token]] = torch.zeros(300)

        for lang in ['en', 'hi', 'gu', 'pa', 'or', 'mr', 'bn']:
            for d in self.dataset:
                if self.lang_map[d["Target_ID"]] == lang:
                    try:
                        # Remove unknown tokens
                        self.targets.append(self.embeddings.vectors[
                            self.embeddings.stoi[d["Target_keyword"]]])
                        self.src_lang.append(self.lang_map[d["Source_ID"]])
                        self.target_lang.append(self.lang_map[d["Target_ID"]])
                        self.phrases.append(d["Source_text"])

                    except KeyError:
                        #print(d["Target_keyword"] + " not found")
                        pass

    def en_tokenizer(self, document):
        '''
        Borrowed preprocessing script from https://stackabuse.com/python-for-nlp-working-with-facebook-fasttext-library/
        '''
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        tokens = [self.stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in self.en_stop]
        tokens = [word for word in tokens if len(word) > 3]

        return tokens

    def indic_tokenizer(self, text, lang):
        '''
        Tokenizer for indic nlp
        '''

        # Tokenize
        tokens = indic_tokenize.trivial_tokenize(text=text, lang=lang)

        # Normalize
        for i in range(len(tokens)):
            tokens[i] = self.normalizers[lang].normalize(tokens[i])

        return tokens

    def preprocessing_data(self, idx, src=True):

        tokens = []

        if src:
            if self.src_lang[idx] != "en":
                tokens = self.indic_tokenizer(self.phrases[idx],
                                              self.src_lang[idx])
            else:
                tokens = self.en_tokenizer(self.phrases[idx])

        t_length = len(tokens)

        if t_length < self.max_seq_length:
            pad_token_length = self.max_seq_length - t_length
            tokens.extend([self.vocabulary.pad_token] * pad_token_length)
        else:
            tokens = tokens[:self.max_seq_length]

        return tokens

    def tokens2tensor(self, tokens):
        '''
        Convert tokens to integer tensors
        '''

        input_id_vector = []

        for t in tokens:
            if self.embeddings.stoi.get(t) is None:
                input_id_vector.append(
                    self.embeddings.stoi[self.vocabulary.unk_token])
            else:
                input_id_vector.append(self.embeddings.stoi[t])

        input_id_vector = torch.tensor(input_id_vector)

        return input_id_vector

    def __getitem__(self, idx):
        '''
        Get item function pytorch

        Arguments:
            idx - text index
        '''

        tokens = self.preprocessing_data(idx, src=True)
        input_idx = self.tokens2tensor(tokens)

        #target = torch.tensor(self.targets[idx])
        target = (self.targets[idx])
        label = torch.ones(target.shape[0], 1)
        return {
            "phrase": {
                'input_ids': input_idx.squeeze(),
            },
            "target": target,
            "label": label
        }

    def __len__(self):
        '''
        Returns length of dataset
        '''

        return len(self.phrases)
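A hypothetical way to wrap this dataset in a PyTorch DataLoader (all paths are placeholders; the JSON schema and embedding file follow the original project's data):

from torch.utils.data import DataLoader

dataset = XLingualTrainDataset_baseline_lstm(
    dataset_path="data/train.json",        # placeholder path
    index_path="embeddings/cc.multi.vec",  # placeholder embedding file
    cache_path="/tmp/vector_cache",        # placeholder cache dir
)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
batch = next(iter(loader))
print(batch["phrase"]["input_ids"].shape, batch["target"].shape)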
Code Example #17
def parse_news_2015(infname, outdir, prefix, src_lang, tgt_lang):
    """
        infname: input XML file  
        outdir: output dir  
        prefix: 'test', or 'train' 
        src_lang 
        tgt_lang 
    """

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # create normalizer
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer(
        lang_code_mapping[tgt_lang]
        if tgt_lang in lang_code_mapping else tgt_lang, False)

    # parser
    tree = ET.parse(infname)
    root = tree.getroot()

    # open files
    srcfile = codecs.open(outdir + '/{}.{}'.format(prefix, src_lang), 'w',
                          'utf-8')
    tgtfile = codecs.open(outdir + '/{}.{}'.format(prefix, tgt_lang), 'w',
                          'utf-8')
    idfile = codecs.open(outdir + '/{}.{}'.format(prefix, 'id'), 'w', 'utf-8')

    # stats
    pairs = 0
    chars_src = 0
    chars_org = 0
    chars_norm = 0

    for name in root:
        srcnode = name.find('SourceName')
        name_id = name.attrib['ID']

        src_text = srcnode.text
        src_words = src_text.split(' ')

        children = None
        if prefix == 'train':
            ## use for training corpus
            children = name.findall('TargetName')
        else:
            # use for test corpus
            children = [name.find('TargetName')]

        for tgtnode in children:
            tgt_id = tgtnode.attrib['ID']
            tgt_text = tgtnode.text
            tgt_words = tgt_text.split(' ')

            # if an input entry contains multiple words

            # Case 1: generate one line per word
            #if len(src_words)==len(tgt_words):
            #    for offsetno, (src_word,tgt_word) in enumerate(zip(src_words,tgt_words)):
            #        srcfile.write(u' '.join(src_word)+'\n')
            #        tgtfile.write(u' '.join(tgt_word)+'\n')
            #        idfile.write('{}_{}_{}\n'.format(name_id,tgt_id,offsetno))

            #        pairs+=1
            #        chars_src+=len(src_word)
            #        chars_org+=len(tgt_word)
            #        if tgt_lang in lang_code_mapping:
            #            tgt_word=normalizer.normalize(tgt_word)
            #        chars_norm+=len(tgt_word)

            # Case 2: generate just a single word

            srcfile.write(u' _ '.join(
                [u' '.join(src_word.upper())
                 for src_word in src_words]) + '\n')
            tgtfile.write(u' _ '.join(
                [u' '.join(tgt_word.upper())
                 for tgt_word in tgt_words]) + '\n')
            idfile.write('{}_{}_{}\n'.format(name_id, tgt_id, 0))

    print '{}|{}|{}|{}|{}|{}|{}'.format(prefix, src_lang, tgt_lang, pairs,
                                        chars_src, chars_org, chars_norm)

    srcfile.close()
    tgtfile.close()
    idfile.close()
Code Example #18
import argparse
import os

import sox

from tqdm import tqdm
from joblib import Parallel, delayed
from glob import glob
import re
import string

from indicnlp.tokenize.indic_tokenize import trivial_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

lang = 'hi'  # Bhojpuri and Hindi both use the Devanagari script, so we use the Devanagari ('hi') normalizer

normalizer = IndicNormalizerFactory().get_normalizer(lang)

pattern_to_remove = '["0-9१-९"\'Z¤ªŸ॰⁄☺]+'


def get_clean_lines(line):
    '''
    Returns the line if no foreign character other than the pattern is present; otherwise returns an empty string.
    '''
    line = line.strip()

    line = re.sub(
        '[%s]' % re.escape("!\"#$%&\()\'*+,-./:;<=>?@[\\]^_`{|}~‘’“\"ः"), '',
        line)
    if re.search(pattern_to_remove, line):
        return ''
Code Example #19
def extract_common_msr_corpus(c0_dir, c1_dir, c0_lang, c1_lang, outdir):

    factory = IndicNormalizerFactory()
    l0_normalizer = factory.get_normalizer(lang_code_mapping[c0_lang])
    l1_normalizer = factory.get_normalizer(lang_code_mapping[c1_lang])

    data_cache = defaultdict(lambda: [set(), set()])

    # read corpus 0
    en0_f = codecs.open(c0_dir + '/train.En', 'r', 'utf-8')
    l0_f = codecs.open(c0_dir + '/train.' + c0_lang, 'r', 'utf-8')

    for en_l, c_l in itertools.izip(iter(en0_f), iter(l0_f)):
        data_cache[en_l.strip()][0].add(l0_normalizer.normalize(c_l.strip()))

    en0_f.close()
    l0_f.close()

    # read corpus 1
    en1_f = codecs.open(c1_dir + '/train.En', 'r', 'utf-8')
    l1_f = codecs.open(c1_dir + '/train.' + c1_lang, 'r', 'utf-8')

    for en_l, c_l in itertools.izip(iter(en1_f), iter(l1_f)):
        data_cache[en_l.strip()][1].add(l1_normalizer.normalize(c_l.strip()))

    en1_f.close()
    l1_f.close()

    # write the common data

    # from language c0 to c1
    cc0_1_f = codecs.open(outdir + '/train.{}-{}'.format(c0_lang, c1_lang),
                          'w', 'utf-8')
    cc0_1_xlit_f = codecs.open(
        outdir + '/train.{}-{}.xlit'.format(c0_lang, c1_lang), 'w', 'utf-8')
    cc0_1_list = []
    cc0_1_xlit_list = []
    for en_l, other_l_lists in data_cache.iteritems():
        if len(other_l_lists[0]) > 0 and len(other_l_lists[1]) > 0:
            for c0_str in other_l_lists[0]:
                c0_str_w = c0_str.replace(u' ', u'')
                other_l_lists_w = [
                    u''.join(x.split()) for x in other_l_lists[1]
                ]
                if len(c0_str_w) > 3:
                    cc0_1_list.append(c0_str_w + u'|' +
                                      u'^'.join(other_l_lists_w) + u'\n')
                    cc0_1_xlit_list.append(
                        UnicodeIndicTransliterator.transliterate(
                            c0_str_w, lang_code_mapping[c0_lang], 'hi') +
                        u'|' + u'^'.join([
                            UnicodeIndicTransliterator.transliterate(
                                x, lang_code_mapping[c1_lang], 'hi')
                            for x in other_l_lists_w
                        ]) + u'\n')

    combined_list = zip(cc0_1_list, cc0_1_xlit_list)
    random.shuffle(combined_list)
    for wr, wr_xlit in combined_list:
        cc0_1_f.write(wr)
        cc0_1_xlit_f.write(wr_xlit)

    cc0_1_f.close()
    cc0_1_xlit_f.close()

    # from language c1 to c0
    cc1_0_f = codecs.open(outdir + '/train.{}-{}'.format(c1_lang, c0_lang),
                          'w', 'utf-8')
    cc1_0_xlit_f = codecs.open(
        outdir + '/train.{}-{}.xlit'.format(c1_lang, c0_lang), 'w', 'utf-8')
    cc1_0_list = []
    cc1_0_xlit_list = []
    for en_l, other_l_lists in data_cache.iteritems():
        if len(other_l_lists[1]) > 0 and len(other_l_lists[0]) > 0:
            for c1_str in other_l_lists[1]:
                c1_str_w = c1_str.replace(u' ', u'')
                other_l_lists_w = [
                    u''.join(x.split()) for x in other_l_lists[0]
                ]
                if len(c1_str_w) > 3:
                    cc1_0_list.append(c1_str_w + u'|' +
                                      u'^'.join(other_l_lists_w) + u'\n')
                    cc1_0_xlit_list.append(
                        UnicodeIndicTransliterator.transliterate(
                            c1_str_w, lang_code_mapping[c1_lang], 'hi') +
                        u'|' + u'^'.join([
                            UnicodeIndicTransliterator.transliterate(
                                x, lang_code_mapping[c0_lang], 'hi')
                            for x in other_l_lists_w
                        ]) + u'\n')

    combined_list = zip(cc1_0_list, cc1_0_xlit_list)
    random.shuffle(combined_list)
    for wr, wr_xlit in combined_list:
        cc1_0_f.write(wr)
        cc1_0_xlit_f.write(wr_xlit)

    cc1_0_f.close()
    cc1_0_xlit_f.close()
Code Example #20
def parse_news_2015(infname, 
                    outdir, 
                    prefix, 
                    src_lang, 
                    tgt_lang): 
    """
        infname: input XML file  
        outdir: output dir  
        prefix: 'test', or 'train' 
        src_lang 
        tgt_lang 
    """


    if not os.path.exists(outdir):        
        os.mkdir(outdir)

    # create normalizer
    factory=IndicNormalizerFactory()
    normalizer=factory.get_normalizer( lang_code_mapping[tgt_lang] if tgt_lang in lang_code_mapping else tgt_lang ,False)

    # parser
    tree = ET.parse(infname)
    root = tree.getroot()
    
    # open files
    srcfile=codecs.open(outdir+'/{}.{}'.format(prefix,src_lang),'w','utf-8')
    tgtfile=codecs.open(outdir+'/{}.{}'.format(prefix,tgt_lang),'w','utf-8')
    idfile=codecs.open(outdir+'/{}.{}'.format(prefix,'id'),'w','utf-8')
    
    # stats
    pairs=0
    chars_src=0
    chars_org=0
    chars_norm=0

    for name in root: 
        srcnode=name.find('SourceName')
        name_id=name.attrib['ID']
    
        src_text=srcnode.text
        src_words=src_text.split(' ')

        children=None
        if prefix=='train':
            ## use for training corpus
            children=name.findall('TargetName')
        else:                        
            # use for test corpus
            children=[name.find('TargetName')]

        for tgtnode in children: 
            tgt_id=tgtnode.attrib['ID']
            tgt_text=tgtnode.text
            tgt_words=tgt_text.split(' ')
 
            # if an input entry contains multiple words 

            # Case 1: generate one line per word   
            #if len(src_words)==len(tgt_words):
            #    for offsetno, (src_word,tgt_word) in enumerate(zip(src_words,tgt_words)): 
            #        srcfile.write(u' '.join(src_word)+'\n')
            #        tgtfile.write(u' '.join(tgt_word)+'\n')
            #        idfile.write('{}_{}_{}\n'.format(name_id,tgt_id,offsetno))
   
            #        pairs+=1
            #        chars_src+=len(src_word)
            #        chars_org+=len(tgt_word)
            #        if tgt_lang in lang_code_mapping:
            #            tgt_word=normalizer.normalize(tgt_word)
            #        chars_norm+=len(tgt_word)

            # Case 2: generate just a single word 

            srcfile.write( u' _ '.join([  u' '.join(src_word.upper()) for src_word in src_words ])   +'\n')
            tgtfile.write( u' _ '.join([  u' '.join(tgt_word.upper()) for tgt_word in tgt_words ])   +'\n')
            idfile.write('{}_{}_{}\n'.format(name_id,tgt_id,0))

    print '{}|{}|{}|{}|{}|{}|{}'.format(prefix,src_lang,tgt_lang,
            pairs,chars_src,chars_org,chars_norm)

    srcfile.close()
    tgtfile.close()
    idfile.close()
Code Example #21
def extract_exclusive_msr_corpus(c0_dir, c1_dir, c0_lang, c1_lang, outdir):

    factory = IndicNormalizerFactory()
    l0_normalizer = factory.get_normalizer(lang_code_mapping[c0_lang])
    l1_normalizer = factory.get_normalizer(lang_code_mapping[c1_lang])

    data_cache = defaultdict(lambda: [set(), set()])

    # read corpus 0
    en0_f = codecs.open(c0_dir + '/train.En', 'r', 'utf-8')
    l0_f = codecs.open(c0_dir + '/train.' + c0_lang, 'r', 'utf-8')

    for en_l, c_l in itertools.izip(iter(en0_f), iter(l0_f)):
        data_cache[en_l.strip()][0].add(l0_normalizer.normalize(c_l.strip()))

    en0_f.close()
    l0_f.close()

    # read corpus 1
    en1_f = codecs.open(c1_dir + '/train.En', 'r', 'utf-8')
    l1_f = codecs.open(c1_dir + '/train.' + c1_lang, 'r', 'utf-8')

    for en_l, c_l in itertools.izip(iter(en1_f), iter(l1_f)):
        data_cache[en_l.strip()][1].add(l1_normalizer.normalize(c_l.strip()))

    en1_f.close()
    l1_f.close()

    # write the common data

    # from language en to c0
    xor_f = codecs.open(outdir + '/train.{}-{}'.format('En', c0_lang), 'w',
                        'utf-8')
    xor_list = []
    for en_l, other_l_lists in data_cache.iteritems():
        if (len(other_l_lists[0]) > 0 and len(other_l_lists[1]) == 0):
            other_l_lists_w = [u''.join(x.split()) for x in other_l_lists[0]]
            xor_list.append(u''.join(en_l.split()) + u'|' +
                            u'^'.join(other_l_lists_w) + u'\n')

    random.shuffle(xor_list)
    for wr in xor_list:
        xor_f.write(wr)

    xor_f.close()

    # from language en to c1
    xor_f = codecs.open(outdir + '/train.{}-{}'.format('En', c1_lang), 'w',
                        'utf-8')
    xor_list = []
    for en_l, other_l_lists in data_cache.iteritems():
        if (len(other_l_lists[0]) == 0 and len(other_l_lists[1]) > 0):
            other_l_lists_w = [u''.join(x.split()) for x in other_l_lists[1]]
            xor_list.append(u''.join(en_l.split()) + u'|' +
                            u'^'.join(other_l_lists_w) + u'\n')

    random.shuffle(xor_list)
    for wr in xor_list:
        xor_f.write(wr)

    xor_f.close()
Code Example #22
File: tokenize_indic.py  Project: bcmi220/d2gpo
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Use: echo {text} | python tokenize_indic.py {language}

import sys

from indicnlp.tokenize.indic_tokenize import trivial_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer(sys.argv[1],
                                    remove_nuktas=False,
                                    nasals_mode='do_nothing')

for line in sys.stdin:
    normalized_line = normalizer.normalize(line.strip())
    tokenized_line = ' '.join(trivial_tokenize(normalized_line, sys.argv[1]))
    print(tokenized_line)
Code Example #23
import pandas as pd
import numpy as np
import glob
import Levenshtein as Lev
from tqdm import tqdm
import swifter
import argparse
from indicnlp.tokenize.indic_tokenize import trivial_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
lang = 'hi'

normalizer_factory = IndicNormalizerFactory()
normalizer = normalizer_factory.get_normalizer(lang)


def wer(s1, s2):
    """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (Levenshtein packages only accepts
    # strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
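    # The excerpt is truncated here. A plausible completion, following the
    # docstring and the word-to-character mapping above (reconstruction, not
    # the original code):
    w2 = [chr(word2char[w]) for w in s2.split()]
    return Lev.distance(''.join(w1), ''.join(w2))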