    n_vow_err = vds_err_df['count'].sum() + vi_err_df['count'].sum()
    n_cons_err = cds_err_df['count'].sum() + ci_err_df['count'].sum()

    ## total errors
    n_tot_err = a_df[a_df.ref_char != a_df.out_char]['count'].sum()

    # other errors
    n_oth_err = n_tot_err - (n_vow_err + n_cons_err)

    return (n_vow_err, n_cons_err, n_oth_err, n_tot_err)


if __name__ == '__main__':

    from indicnlp import loader
    loader.load()

    #reffname=sys.argv[1]
    #outfname=sys.argv[2]
    #tgtlang=sys.argv[3]
    #outdir=sys.argv[4]

    #if not os.path.exists(outdir):
    #    print outdir
    #    os.mkdir(outdir)

    #save_analysis_artifacts(reffname, outfname, tgtlang, outdir)

    #a_df=read_align_count_file('/home/development/anoop/experiments/multilingual_unsup_xlit/results/sup/news_2015_official/2_multilingual/onehot_shared/multi-conf/outputs/022_analysis_en-bn/alignment_count.csv')
    #print char_error_rate(a_df)
    #print vowel_error_rate(a_df,'bn')
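# Hedged usage sketch: the commented-out driver above suggests that the return
# statement belongs to `vowel_error_rate` and that `read_align_count_file` loads
# the alignment-count CSV; both names and the path below are assumptions taken
# only from those comments, not verified against the rest of the repository.
#
#   a_df = read_align_count_file('alignment_count.csv')       # placeholder path
#   n_vow, n_cons, n_oth, n_tot = vowel_error_rate(a_df, 'bn')
#   assert n_tot == n_vow + n_cons + n_oth   # counts decompose by construction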
def create_moses_factored_run_params(conf_template_fname, conf_fname, workspace_dir, parallel_corpus, lm_file, factored_lm_dir, src_lang, tgt_lang):
    with codecs.open(conf_fname, 'w', 'utf-8') as conf_file:
        conf_template = ''.join(read_lines(conf_template_fname))
        conf = conf_template.format(workspace_dir=workspace_dir,
                                    parallel_corpus=parallel_corpus,
                                    lm_file=lm_file,
                                    factored_lm_dir=factored_lm_dir,
                                    src_lang=src_lang,
                                    tgt_lang=tgt_lang)
        conf_file.write(conf)


def create_moses_ini_params(ini_template_fname, ini_fname, numfeatures, phrasetable, lmfname, lmorder):
    initfeatvalues = ' '.join(['0.2'] * numfeatures)
    with codecs.open(ini_fname, 'w', 'utf-8') as ini_file:
        ini_template = ''.join(read_lines(ini_template_fname))
        ini = ini_template.format(numfeatures=numfeatures,
                                  phrasetable=phrasetable,
                                  lmfname=lmfname,
                                  lmorder=lmorder,
                                  initfeatvalues=initfeatvalues)
        ini_file.write(ini)


if __name__ == '__main__':

    ### INDIC_NLP_RESOURCES environment variable must be set
    loader.load()

    command = sys.argv[1]

    if command == 'create_synthetic_corpus_split':
        create_synthetic_corpus_split(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6],
                                      n_xlit=int(sys.argv[7]), n_tun=int(sys.argv[8]))
    elif command == 'create_synthetic_corpus_concatenated':
        create_synthetic_corpus_concatenated(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6],
                                             n_xlit=int(sys.argv[7]), n_tun=int(sys.argv[8]))
    elif command == 'create_moses_run_params':
        create_moses_run_params(*sys.argv[2:])
    elif command == 'create_moses_factored_run_params':
        create_moses_factored_run_params(*sys.argv[2:])
    elif command == 'create_moses_ini_params':
        create_moses_ini_params(sys.argv[2], sys.argv[3], int(sys.argv[4]), sys.argv[5], sys.argv[6], int(sys.argv[7]))
    else:
        print("Unknown command")
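# Hedged CLI sketch: the dispatcher above reads positional arguments in the order
# shown below; the script name and all file paths are placeholders, not taken
# from the original repository.
#
#   python make_moses_params.py create_moses_ini_params \
#       moses.ini.template moses.ini 4 phrase-table.gz lm.blm 5
#   # -> ini_template_fname, ini_fname, numfeatures=4, phrasetable, lmfname, lmorder=5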
def get_split_algo(lang: str, split_algo: str) -> tp.Callable[[str], tp.Iterable[str]]:
    # get default algorithm if requested
    if split_algo == "default":
        # use best algorithm as a function of language
        if lang in LANGS_MOSES:
            split_algo = "moses"
        elif lang in LANGS_INDIC:
            split_algo = "indic"
        elif lang in LANGS_GEEZ:
            split_algo = "geez"
        elif lang in LANGS_KHMER:
            split_algo = "khmer"
        elif lang in LANGS_BURMESE:
            split_algo = "burmese"
        else:
            # use Moses by default (which will likely fall back to English)
            split_algo = "moses"
        logger.info(f" - default algorithm for {lang} is {split_algo}")

    if split_algo == "none" or lang == "TODO":
        logger.info(" - no sentence splitting")
        return lambda line: [line]

    elif split_algo == "moses":
        if lang in LANGS_MOSES:
            lang = LANGS_MOSES[lang]
            logger.info(f" - Moses sentence splitter: using rules for '{lang}'")
        else:
            logger.info(
                f" - Moses sentence splitter for {lang}: falling back to English rules"
            )
            lang = "en"
        splitter = SentenceSplitter(language=lang)
        # non_breaking_prefix_file=non_breaking_prefix_file
        return splitter.split

    elif split_algo == "indic":
        # initialize toolkit (apparently not needed for sentence segmentation)
        if INDIC_NLP_RESOURCES:
            logger.info(" - Initialize Indic NLP toolkit")
            indic_common.set_resources_path(INDIC_NLP_RESOURCES)
            indic_loader.load()
        if lang in LANGS_INDIC:
            lang = LANGS_INDIC[lang]
            logger.info(f" - Indic sentence splitter: using rules for '{lang}'")
        else:
            logger.info(
                f" - Indic sentence splitter for {lang}: falling back to Hindi rules"
            )
            lang = "hi"

        # setup normalizer
        factory = IndicNormalizerFactory()
        indic_normalizer = factory.get_normalizer(lang)

        def split_indic(line: str) -> tp.Iterable[str]:
            """Split Indian text into sentences using the Indic NLP tool."""
            line = indic_normalizer.normalize(line)
            for sent in indic_sent_tok.sentence_split(line, lang=lang):
                yield sent

        return split_indic

    elif split_algo == "laonlp":
        logger.info(f" - LaoNLP sentence splitter applied to '{lang}'")
        return lao_sent_tok

    elif split_algo == "khmer":
        logger.info(f" - Khmer NLTK sentence splitter applied to '{lang}'")
        return khm_sent_tok

    elif split_algo == "bodnlp":
        logger.info(f" - Tibetan NLTK sentence splitter applied to '{lang}'")
        return bod_sent_tok

    elif split_algo == "geez":
        logger.info(f" - Ge'ez rule-based sentence splitter applied to '{lang}'")
        return split_geez

    elif split_algo == "burmese":
        logger.info(f" - Burmese rule-based sentence splitter applied to '{lang}'")
        return split_burmese

    else:
        logger.error(f"Unknown splitting algorithm {split_algo}")

    return None
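# Hedged usage sketch: `get_split_algo` returns a callable that maps one input
# line to an iterable of sentences. The language code below is a placeholder;
# valid codes depend on the LANGS_* tables defined elsewhere in this module.
#
#   split = get_split_algo("hin", split_algo="default")   # hypothetical language code
#   for sent in split("पहला वाक्य। दूसरा वाक्य।"):         # "First sentence. Second sentence."
#       print(sent)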
def main(_):

    #### Load Indic NLP Library ###
    ## Note: Environment variable: INDIC_RESOURCES_PATH must be set
    loader.load()

    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    print('=========== PARAMETERS ==============')
    print('Data Path: ' + FLAGS.data_path)
    print('Representation: ' + FLAGS.representation)
    print('Language: ' + str(FLAGS.lang))
    print('Corpus Size: ' + str(FLAGS.train_size))
    print('Config: ' + FLAGS.model)
    print('=========== PARAMETERS ==============')

    raw_data = reader.ptb_raw_data(FLAGS.data_path, FLAGS.lang, FLAGS.train_size)
    train_data, valid_data, test_data, actual_vocab_size = raw_data
    print('Actual Vocab Size: ' + str(actual_vocab_size))

    ### set parameters
    config = get_config()
    config.vocab_size = actual_vocab_size
    config.lang = FLAGS.lang
    config.representation = FLAGS.representation

    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    eval_config.vocab_size = actual_vocab_size
    eval_config.lang = FLAGS.lang
    eval_config.representation = FLAGS.representation

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = PTBModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = PTBModel(is_training=False, config=config)
            mtest = PTBModel(is_training=False, config=eval_config)

        tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            train_perplexity = run_epoch(session, m, train_data, m.train_op, verbose=True)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
            valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
            print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

        test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
        print("Test Perplexity: %.3f" % test_perplexity)
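# Hedged invocation sketch: flag names are taken from the FLAGS references in
# main() above; the script name and all flag values are placeholders.
#
#   python ptb_word_lm.py \
#       --data_path=/path/to/corpus \
#       --lang=hi \
#       --representation=onehot \
#       --model=small \
#       --train_size=100000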
    def __init__(self, lang='en'):
        self.lang = lang
        self.stopwords = None
        self.stemmer = None
        self.sentiment_analyzer = None
        self.text_processor = None

        INDIC_NLP_RESOURCES = r"../model/indic_nlp_resources/"
        common.set_resources_path(INDIC_NLP_RESOURCES)

        self.pos_tagger = None

        if lang == 'hi':
            self.ht = HindiTokenizer.Tokenizer()
            self.sentiment_analyzer = load_learner(path="../model/hi-sentiment")
            self.stopwords = [x.strip() for x in open("../data/stopwords.txt").readlines()]
            other_exclusions = ["#ff", "ff", "rt"]
            self.stopwords.extend(other_exclusions)
            self.stemmer = None
            self.text_processor = TextPreProcessor(
                # terms that will be normalized
                normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                           'time', 'url', 'date', 'number'],
                # terms that will be annotated
                annotate={"hashtag", "allcaps", "elongated", "repeated",
                          'emphasis', 'censored'},
                fix_html=True,  # fix HTML tokens
            )
            loader.load()
            train_data = indian.tagged_sents('hindi.pos')
            self.tnt_pos_tagger = tnt.TnT()
            self.tnt_pos_tagger.train(train_data)

        if lang == 'en':
            self.sentiment_analyzer = VS()
            self.stopwords = nltk.corpus.stopwords.words("english")
            other_exclusions = ["#ff", "ff", "rt"]
            self.stopwords.extend(other_exclusions)
            self.stemmer = PorterStemmer()
            self.text_processor = TextPreProcessor(
                # terms that will be normalized
                normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                           'time', 'url', 'date', 'number'],
                # terms that will be annotated
                annotate={"hashtag", "allcaps", "elongated", "repeated",
                          'emphasis', 'censored'},
                fix_html=True,  # fix HTML tokens

                # corpus from which the word statistics are going to be used
                # for word segmentation
                segmenter="twitter",

                # corpus from which the word statistics are going to be used
                # for spell correction
                corrector="twitter",

                unpack_hashtags=True,  # perform word segmentation on hashtags
                unpack_contractions=True,  # Unpack contractions (can't -> can not)
                spell_correct_elong=False,  # spell correction for elongated words

                # select a tokenizer. You can use SocialTokenizer, or pass your own;
                # the tokenizer should take a string as input and return a list of tokens
                tokenizer=SocialTokenizer(lowercase=True).tokenize,

                # list of dictionaries for replacing tokens extracted from the text
                # with other expressions. You can pass more than one dictionary.
                dicts=[emoticons, slang]
            )
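# Hedged usage sketch: the class name is assumed (only __init__ is shown above),
# and the relative model/data paths hard-coded in __init__ must exist for the
# constructor to succeed.
#
#   hi_proc = Preprocessor(lang='hi')   # hypothetical class name
#   # -> Hindi: pre-trained sentiment learner, custom stopword list, TnT POS tagger trained on 'hindi.pos'
#   en_proc = Preprocessor(lang='en')
#   # -> English: VS() sentiment analyzer, NLTK stopwords, Porter stemmer, ekphrasis TextPreProcessor pipeline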