INDIC_NLP_LIB_HOME = "indic_nlp_library"
INDIC_NLP_RESOURCES = "indic_nlp_resources"
import sys

# The library path must be on sys.path before anything is imported from
# indicnlp, so the append comes first.
sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))

from indicnlp import common

common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader

loader.load()
from configparser import ConfigParser
from collections import defaultdict

import pandas as pd
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer
from sacremoses import MosesDetokenizer

import indicnlp
from indicnlp import transliterate
from indicnlp.tokenize import indic_tokenize
from indicnlp.tokenize import indic_detokenize
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate


def calculate_krippendorff(infname, missing='*'):
    """Compute inter-annotator agreement (Krippendorff's alpha) between
    the 'ann1' and 'ann2' annotation columns of `infname`.

    Assumes a CSV file with one annotated item per row and columns named
    'ann1' and 'ann2'; `missing` marks items an annotator skipped.
    """
    input_df = pd.read_csv(infname)
    input_array_ann1 = input_df[['ann1']].values.tolist()
    input_array_ann2 = input_df[['ann2']].values.tolist()

    print("nominal metric: %.3f" % krippendorff_alpha(
        [sum(input_array_ann1, []),
         sum(input_array_ann2, [])],
        nominal_metric,
        missing_items=missing,
        convert_items=str))
    # print("interval metric: %.3f" % krippendorff_alpha(
    #     [sum(input_array_ann1, []), sum(input_array_ann2, [])],
    #     interval_metric, missing_items=missing, convert_items=str))

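# The function above assumes `krippendorff_alpha` and `nominal_metric` are
# defined elsewhere. A minimal sketch of the standard computation follows
# (hypothetical helpers written for this example, not part of the Indic
# NLP library):


def nominal_metric(a, b):
    # disagreement is 0 for identical labels, 1 otherwise
    return a != b


def krippendorff_alpha(data, metric=nominal_metric,
                       convert_items=str, missing_items=None):
    """Krippendorff's alpha for `data`, a list of per-coder label sequences."""
    # group the values assigned to each item, skipping missing markers
    units = {}
    for coder in data:
        for idx, value in enumerate(coder):
            if value != missing_items:
                units.setdefault(idx, []).append(convert_items(value))
    # only items rated by at least two coders are pairable
    units = {k: v for k, v in units.items() if len(v) > 1}
    n = sum(len(v) for v in units.values())
    if n == 0:
        return 0.0

    # observed disagreement: pairwise distances within each item
    Do = 0.0
    for values in units.values():
        Do += sum(metric(a, b) for a in values for b in values) / (len(values) - 1)
    Do /= n
    if Do == 0:
        return 1.0

    # expected disagreement: pairwise distances across all pairable values
    flat = [v for values in units.values() for v in values]
    De = sum(metric(a, b) for a in flat for b in flat) / (n * (n - 1))
    return 1.0 - Do / De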

if __name__ == '__main__':
    # SafeConfigParser was renamed ConfigParser in Python 3.2 and removed in 3.12
    parser = ConfigParser()
    config_file = sys.argv[1]
    parser.read(config_file)
    common.set_resources_path(parser.get('indic_config',
                                         'indic_resource_path'))
    NER_executor = Executor(config_file)
    NER_executor.findPOSTags()
    NER_executor.findMorphenes()
    NER_executor.find_suffix_features()
    NER_executor.mergeModuleOutputs(
        parser.get('pos_tagger', 'pos_tagger_output'),
        parser.get('morphessor', 'morpheme_output_file'),
        parser.get('ner_tag_data', 'ner_word_tags'),
        parser.get('suffix_files', 'suffix_output_file'),
        parser.get('crf_learner', 'crf_input_file'))
    # NER_executor.trainNER()
    # print(NER_executor.calculateF1Score("./final_crf_output_1"))
    # NER_executor.calculateKripendorffCoeeficient("./interannotation")
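
# The Executor class used above is defined elsewhere in this project. For
# reference, a config file matching the parser.get() calls might look like
# this (section and option names are taken from the code; the values are
# placeholders):
#
#   [indic_config]
#   indic_resource_path = indic_nlp_resources
#
#   [pos_tagger]
#   pos_tagger_output = pos_tagger_output.txt
#
#   [morphessor]
#   morpheme_output_file = morpheme_output.txt
#
#   [ner_tag_data]
#   ner_word_tags = ner_word_tags.txt
#
#   [suffix_files]
#   suffix_output_file = suffix_output.txt
#
#   [crf_learner]
#   crf_input_file = crf_input.txt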
Example #3
# -*- coding: utf-8 -*-

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME = "/Users/Avijit/Documents/nlp_lib"
# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES = "/Users/Avijit/Documents/nlp_res"

from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
loader.load()


from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

# क़ written two ways: as the single precomposed code point (U+0958) and as
# KA + NUKTA (U+0915 U+093C)
input_text = u"\u0958 \u0915\u093c"
remove_nuktas = False
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("hi", remove_nuktas)
output_text = normalizer.normalize(input_text)

# Python 3 prints; the normalizer rewrites the precomposed form to the
# decomposed one, so the length should grow from 4 to 5 code points
print(output_text)
print('Length before normalization: {}'.format(len(input_text)))
print('Length after normalization: {}'.format(len(output_text)))
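
# A minimal follow-on sketch: normalized text is typically passed to the
# library's word tokenizer next (trivial_tokenize is the documented entry
# point; the call below reuses output_text from above):

from indicnlp.tokenize import indic_tokenize

for token in indic_tokenize.trivial_tokenize(output_text, lang='hi'):
    print(token)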
# This fragment assumes module-level context from its source file:
# `import typing as tp`, a `logger`, the LANGS_* lookup tables, and the
# per-language splitters (SentenceSplitter, indic_sent_tok, khm_sent_tok,
# and friends).
def get_split_algo(
        lang: str,
        split_algo: str) -> tp.Optional[tp.Callable[[str], tp.Iterable[str]]]:
    # get default algorithm if requested
    if split_algo == "default":
        # use best algorithm in function of language
        if lang in LANGS_MOSES:
            split_algo = "moses"
        elif lang in LANGS_INDIC:
            split_algo = "indic"
        elif lang in LANGS_GEEZ:
            split_algo = "geez"
        elif lang in LANGS_KHMER:
            split_algo = "khmer"
        elif lang in LANGS_BURMESE:
            split_algo = "burmese"
        else:
            # use Moses by default (which likely will fall-back to English)
            split_algo = "moses"
        logger.info(f" - default algorithm for {lang} is {split_algo}")

    if split_algo == "none" or lang == "TODO":
        logger.info(" - no sentence splitting")
        return lambda line: [line]

    elif split_algo == "moses":
        if lang in LANGS_MOSES:
            lang = LANGS_MOSES[lang]
            logger.info(
                f" - Moses sentence splitter: using rules for '{lang}'")
        else:
            # log the original language before overwriting it, so the
            # fallback message stays informative
            logger.info(
                f" - Moses sentence splitter for {lang}: falling back to English rules"
            )
            lang = "en"
        splitter = SentenceSplitter(language=lang)
        # non_breaking_prefix_file=non_breaking_prefix_file
        return splitter.split

    elif split_algo == "indic":
        # initialize toolkit (apparently not needed for sentence segmentation)
        if INDIC_NLP_RESOURCES:
            logger.info(" - Initialize Indic NLP toolkit")
            indic_common.set_resources_path(INDIC_NLP_RESOURCES)
            indic_loader.load()
        if lang in LANGS_INDIC:
            lang = LANGS_INDIC[lang]
            logger.info(
                f" - Indic sentence splitter: using rules for '{lang}'")
        else:
            # log the original language before overwriting it
            logger.info(
                f" - Indic sentence splitter for {lang}: falling back to Hindi rules"
            )
            lang = "hi"

        # setup normalizer
        factory = IndicNormalizerFactory()
        indic_normalizer = factory.get_normalizer(lang)

        def split_indic(line: str) -> tp.Iterable[str]:
            """Split Indian text into sentences using Indic NLP tool."""
            line = indic_normalizer.normalize(line)
            for sent in indic_sent_tok.sentence_split(line, lang=lang):
                yield sent

        return split_indic

    elif split_algo == "laonlp":
        logger.info(f" - LaoNLP sentence splitter applied to '{lang}'")
        return lao_sent_tok

    elif split_algo == "khmer":
        logger.info(f" - Khmer NLTK sentence splitter applied to '{lang}'")
        return khm_sent_tok

    elif split_algo == "bodnlp":
        logger.info(f" - Tibetan NLTK sentence splitter applied to '{lang}'")
        return bod_sent_tok

    elif split_algo == "geez":
        logger.info(
            f" - Ge'ez rule-based sentence splitter applied to '{lang}'")
        return split_geez

    elif split_algo == "burmese":
        logger.info(
            f" - Burmese rule-based sentence splitter applied to '{lang}'")
        return split_burmese

    else:
        logger.error(f"Unknown splitting algorithm {split_algo}")

    return None
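
# A minimal usage sketch for the factory above, assuming "hi" (or the
# appropriate language code) is a key of LANGS_INDIC; the sample sentences
# are made up:

splitter = get_split_algo("hi", split_algo="default")
if splitter is not None:
    for sentence in splitter("पहला वाक्य। दूसरा वाक्य।"):
        print(sentence)
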
# The __init__ below belongs to a text-preprocessor class whose definition
# (class header and remaining methods) is not shown in this example.
    def __init__(self, lang='en'):
        self.lang = lang
        self.stopwords = None
        self.stemmer = None
        self.sentiment_analyzer = None
        self.text_processor = None
        INDIC_NLP_RESOURCES = r"../model/indic_nlp_resources/"
        common.set_resources_path(INDIC_NLP_RESOURCES)
        self.pos_tagger = None

        if lang == 'hi':
            self.ht = HindiTokenizer.Tokenizer()
            self.sentiment_analyzer = load_learner(path="../model/hi-sentiment")
            self.stopwords = [x.strip() for x in open("../data/stopwords.txt").readlines()]
            other_exclusions = ["#ff", "ff", "rt"]
            self.stopwords.extend(other_exclusions)
            self.stemmer = None
            self.text_processor = TextPreProcessor(
                # terms that will be normalized ('url' was listed twice)
                normalize=['url', 'email', 'percent', 'money', 'phone',
                           'user', 'time', 'date', 'number'],
                # terms that will be annotated
                annotate={"hashtag", "allcaps", "elongated", "repeated",
                          'emphasis', 'censored'},
                fix_html=True,  # fix HTML tokens
            )
            loader.load()
            # train a TnT POS tagger on the NLTK Hindi POS corpus ('hindi.pos')
            train_data = indian.tagged_sents('hindi.pos')
            self.tnt_pos_tagger = tnt.TnT()
            self.tnt_pos_tagger.train(train_data)

        if lang == 'en':
            self.sentiment_analyzer = VS()
            self.stopwords = nltk.corpus.stopwords.words("english")
            other_exclusions = ["#ff", "ff", "rt"]
            self.stopwords.extend(other_exclusions)
            self.stemmer = PorterStemmer()
            self.text_processor = TextPreProcessor(
                # terms that will be normalized
                normalize=['url', 'email', 'percent', 'money', 'phone',
                           'user', 'time', 'date', 'number'],
                # terms that will be annotated
                annotate={"hashtag", "allcaps", "elongated", "repeated",
                          'emphasis', 'censored'},
                fix_html=True,  # fix HTML tokens

                # corpus from which the word statistics are going to be used 
                # for word segmentation 
                segmenter="twitter", 

                # corpus from which the word statistics are going to be used 
                # for spell correction
                corrector="twitter", 

                unpack_hashtags=True,  # perform word segmentation on hashtags
                unpack_contractions=True,  # Unpack contractions (can't -> can not)
                spell_correct_elong=False,  # spell correction for elongated words

                # select a tokenizer. You can use SocialTokenizer or pass your
                # own; the tokenizer should take a string as input and return
                # a list of tokens
                tokenizer=SocialTokenizer(lowercase=True).tokenize,

                # list of dictionaries for replacing tokens extracted from the
                # text with other expressions; more than one dictionary can be
                # passed
                dicts=[emoticons, slang]
            )
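
# A minimal, self-contained sketch of the ekphrasis pipeline configured
# above, independent of the surrounding (unshown) class; the sample tweet
# is made up:

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

demo_processor = TextPreProcessor(
    normalize=['url', 'user', 'number'],
    annotate={"hashtag", "elongated"},
    fix_html=True,
    segmenter="twitter",   # word statistics for hashtag segmentation
    corrector="twitter",   # word statistics for spell correction
    unpack_hashtags=True,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
)
print(demo_processor.pre_process_doc("@user check #IndicNLP http://t.co/xyz"))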