Example 1
    def __init__(self):
        self.CuratedList = self.loadCuratedList()
        self.stop_words = set(stopwords.words('arabic'))
        self.arStemmer = Analyzer(MorphologyDB.builtin_db())
        self.sentSegRegexPattern = self.loadSentSegmentationList()
        self.DotChar = '_'
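
For context, a minimal sketch of the imports this fragment assumes (NLTK's Arabic stopword list must be downloaded once):

import nltk
from nltk.corpus import stopwords
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer

nltk.download('stopwords')  # one-time download of the NLTK stopword corpora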
Example 2
def _analyze(db, fin, fout, backoff, cache):
    if cache:
        analyzer = Analyzer(db, backoff, cache_size=1024)
    else:
        analyzer = Analyzer(db, backoff)

    line = force_unicode(fin.readline())

    while line:
        # Skip lines that are empty after stripping whitespace
        line = line.strip()
        if len(line) == 0:
            line = force_unicode(fin.readline())
            continue

        tokens = _tokenize(line)

        for token in tokens:
            analyses = analyzer.analyze(token)

            serialized = _serialize_analyses(fout, token, analyses, db.order)

            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))

            fout.write('\n\n')

        line = force_unicode(fin.readline())
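
For comparison, a minimal stand-alone sketch of the same loop written against the public camel_tools API only (the input file name is a placeholder):

from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.tokenizers.word import simple_word_tokenize

db = MorphologyDB.builtin_db()
analyzer = Analyzer(db, backoff='NONE', cache_size=1024)

with open('input.txt', encoding='utf-8') as fin:
    for line in fin:
        for token in simple_word_tokenize(line.strip()):
            for analysis in analyzer.analyze(token):
                # Each analysis is a dict of features, e.g. lemma and POS
                print(token, analysis.get('lex'), analysis.get('pos'))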
Example 3
    def __init__(self, db):
        if not isinstance(db, MorphologyDB):
            raise ReinflectorError('DB is not an instance of MorphologyDB')
        if not db.flags.generation:
            raise ReinflectorError('DB does not support reinflection')

        self._db = db

        self._analyzer = Analyzer(db)
        self._generator = Generator(db)
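
These checks pass only when the database is opened with generation enabled; a minimal sketch using the builtin database (camel_tools flags: 'a' analysis, 'g' generation, 'r' reinflection):

from camel_tools.morphology.database import MorphologyDB

# Analysis-only: db.flags.generation is False, so Reinflector would raise
db_analysis_only = MorphologyDB.builtin_db(flags='a')

# Reinflection mode (analysis + generation): accepted by the constructor
db_reinflect = MorphologyDB.builtin_db(flags='r')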
Example 4
    def pretrained(model_name='msa',
                   top=1,
                   use_gpu=True,
                   batch_size=32,
                   cache_size=10000):
        """Load a pre-trained model provided with camel_tools.

        Args:
            model_name (:obj:`str`, optional): Name of pre-trained model to
                load. Three models are available: 'msa', 'egy', and 'glf'.
                Defaults to `msa`.
            top (:obj:`int`, optional): The maximum number of top analyses to
                return. Defaults to 1.
            use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
                Defaults to True.
            batch_size (:obj:`int`, optional): The batch size. Defaults to 32.
            cache_size (:obj:`int`, optional): If greater than zero, then
                the analyzer will cache the analyses for the cache_size most
                frequent words, otherwise no analyses will be cached.
                Defaults to 10000.

        Returns:
            :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
            pre-trained model.
        """

        model_info = CATALOGUE.get_dataset('DisambigBertUnfactored',
                                           model_name)
        model_config = _read_json(Path(model_info.path, 'default_config.json'))
        model_path = str(model_info.path)
        features = FEATURE_SET_MAP[model_config['feature']]
        db = MorphologyDB.builtin_db(model_config['db_name'], 'a')
        analyzer = Analyzer(db,
                            backoff=model_config['backoff'],
                            cache_size=cache_size)
        scorer = model_config['scorer']
        tie_breaker = model_config['tie_breaker']
        ranking_cache = model_config['ranking_cache']

        return BERTUnfactoredDisambiguator(model_path,
                                           analyzer,
                                           top=top,
                                           features=features,
                                           scorer=scorer,
                                           tie_breaker=tie_breaker,
                                           use_gpu=use_gpu,
                                           batch_size=batch_size,
                                           ranking_cache=ranking_cache)
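
A short usage sketch, assuming the pre-trained model data has been installed (e.g. via the camel_data utility); the sentence is illustrative:

from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize

bert = BERTUnfactoredDisambiguator.pretrained('msa', top=1)

sentence = simple_word_tokenize('ذهب الولد إلى المدرسة')
for disambig_word in bert.disambiguate(sentence):
    # Each result pairs the input word with its scored analyses
    top_analysis = disambig_word.analyses[0].analysis
    print(disambig_word.word, top_analysis.get('pos'))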
Example 5
    def pretrained_from_config(config,
                               top=1,
                               use_gpu=True,
                               batch_size=32,
                               cache_size=10000):
        """Load a pre-trained model from a config file.

            Args:
                config (:obj:`str`): Config file that defines the model
                    details.
                top (:obj:`int`, optional): The maximum number of top analyses
                    to return. Defaults to 1.
                use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
                    Defaults to True.
                batch_size (:obj:`int`, optional): The batch size.
                    Defaults to 32.
                cache_size (:obj:`int`, optional): If greater than zero, then
                    the analyzer will cache the analyses for the cache_size
                    most frequent words, otherwise no analyses will be cached.
                    Defaults to 10000.

            Returns:
                :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
                pre-trained model.
            """

        model_config = _read_json(config)
        model_path = model_config['model_path']
        features = FEATURE_SET_MAP[model_config['feature']]
        db = MorphologyDB(model_config['db_path'], 'a')
        analyzer = Analyzer(db,
                            backoff=model_config['backoff'],
                            cache_size=cache_size)
        scorer = model_config['scorer']
        tie_breaker = model_config['tie_breaker']
        ranking_cache = model_config['ranking_cache']

        return BERTUnfactoredDisambiguator(model_path,
                                           analyzer,
                                           top=top,
                                           features=features,
                                           scorer=scorer,
                                           tie_breaker=tie_breaker,
                                           use_gpu=use_gpu,
                                           batch_size=batch_size,
                                           ranking_cache=ranking_cache)
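
Judging from the keys read above, a config file has roughly the following shape (all values are illustrative placeholders, not shipped defaults):

import json

config = {
    'model_path': '/path/to/bert/model',
    'feature': 'unfactored',          # hypothetical key into FEATURE_SET_MAP
    'db_path': '/path/to/morphology.db',
    'backoff': 'NOAN_PROP',
    'scorer': 'uniform',
    'tie_breaker': 'tag',
    'ranking_cache': None,
}

with open('my_config.json', 'w') as f:
    json.dump(config, f, indent=2)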
Example 6
def load(lang, nlp=None):
    # Make sure the language is supported
    supported = {"en", "ar"}
    if lang not in supported:
        raise Exception("%s is an unsupported or unknown language" % lang)

    if lang == "en":
        # Load spacy
        nlp = nlp or spacy.load(lang, disable=["ner"])

        # Load language edit merger
        merger = import_module("errant.%s.merger" % lang)

        # Load language edit classifier
        classifier = import_module("errant.%s.classifier" % lang)
        # The English classifier needs spacy
        classifier.nlp = nlp

        # Return a configured ERRANT annotator
        return Annotator(lang, nlp, merger, classifier)

    if lang == "ar":
        # Load spacy
        # nlp = nlp or spacy.load(lang, disable=["ner"])
        db = MorphologyDB.builtin_db()
        analyzer = Analyzer(db)
        mled = MLEDisambiguator.pretrained()
        tagger = DefaultTagger(mled, 'pos')
        nlp = [analyzer, tagger]

        # Load language edit merger
        merger = import_module("errant.%s.merger" % lang)

        # Load language edit classifier
        classifier = import_module("errant.%s.classifier" % lang)
        # Unlike the English classifier, the Arabic classifier does not
        # need spacy
        # classifier.nlp = nlp

        # Return a configured ERRANT annotator
        return Annotator(lang, nlp, merger, classifier)
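
A brief usage sketch of this loader (upstream ERRANT exposes it as errant.load; here it is called directly):

# English annotator backed by spacy
annotator_en = load("en")

# Arabic annotator backed by camel_tools components
annotator_ar = load("ar")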
Example 7
def _calima_egy_r13_analyzer():
    db = MorphologyDB.builtin_db('calima-egy-r13', 'a')
    analyzer = Analyzer(db, 'NOAN_PROP')
    return analyzer
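
A quick usage sketch; 'NOAN_PROP' is the backoff mode that generates proper-noun analyses for out-of-vocabulary words, and the 'calima-egy-r13' database must already be installed:

analyzer = _calima_egy_r13_analyzer()

# Analyze a single (illustrative) token
for analysis in analyzer.analyze('كتاب'):
    print(analysis.get('lex'), analysis.get('pos'))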
Example 8
class Reinflector(object):
    """Morphological reinflector component.

    Arguments:
        db (:obj:`~camel_tools.morphology.database.MorphologyDB`): Database to
            use for generation. Must be opened in reinflection mode or both
            analysis and generation modes.

    Raises:
        :obj:`~camel_tools.morphology.errors.ReinflectorError`: If **db** is
            not an instance of
            :obj:`~camel_tools.morphology.database.MorphologyDB` or if **db**
            does not support reinflection.
    """
    def __init__(self, db):
        if not isinstance(db, MorphologyDB):
            raise ReinflectorError('DB is not an instance of MorphologyDB')
        if not db.flags.generation:
            raise ReinflectorError('DB does not support reinflection')

        self._db = db

        self._analyzer = Analyzer(db)
        self._generator = Generator(db)

    def reinflect(self, word, feats):
        """Generate analyses for a given word from a given set of inflectional
        features.

        Arguments:
            word (:obj:`str`): Word to reinflect.
            feats (:obj:`dict`): Dictionary of features.
                See :doc:`/reference/camel_morphology_features` for more
                information on features and their values.

        Returns:
            :obj:`list` of :obj:`dict`: List of generated analyses.
            See :doc:`/reference/camel_morphology_features` for more
            information on features and their values.

        Raises:
            :obj:`~camel_tools.morphology.errors.InvalidReinflectorFeature`:
                If a feature is given that is not defined in the database.
            :obj:`~camel_tools.morphology.errors.InvalidReinflectorFeatureValue`:
                If an invalid value is given to a feature or if 'pos' feature
                is not defined.
        """

        analyses = self._analyzer.analyze(word)

        if not analyses:
            return []

        for feat in feats:
            if feat not in self._db.defines:
                raise InvalidReinflectorFeature(feat)
            elif self._db.defines[feat] is not None:
                if feat in _ANY_FEATS and feats[feat] == 'ANY':
                    continue
                elif feats[feat] not in self._db.defines[feat]:
                    raise InvalidReinflectorFeatureValue(feat, feats[feat])

        has_clitics = False
        for feat in _CLITIC_FEATS:
            if feat in feats:
                has_clitics = True
                break

        results = deque()

        for analysis in analyses:
            if dediac_ar(analysis['diac']) != dediac_ar(word):
                continue

            if 'pos' in feats and feats['pos'] != analysis['pos']:
                continue

            lemma = _LEMMA_SPLIT_RE.split(analysis['lex'])[0]

            if 'lex' in feats and feats['lex'] != lemma:
                continue

            is_valid = True
            generate_feats = {}

            for feat in analysis.keys():
                if feat in _IGNORED_FEATS:
                    continue
                elif feat in _SPECIFIED_FEATS and feat not in feats:
                    continue
                elif has_clitics and feat in _CLITIC_IGNORED_FEATS:
                    continue
                else:
                    if feat in feats:
                        if feats[feat] == 'ANY':
                            continue
                        elif analysis[feat] != 'na':
                            generate_feats[feat] = feats[feat]
                        else:
                            is_valid = False
                            break
                    elif analysis[feat] != 'na':
                        generate_feats[feat] = analysis[feat]

            if is_valid:
                generated = self._generator.generate(lemma, generate_feats)
                if generated is not None:
                    results.extend(generated)

        # TODO: Temporary fix to get unique analyses
        results = [dict(y) for y in set(tuple(x.items()) for x in results)]

        return list(results)
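
A minimal usage sketch (the word and feature values are illustrative; valid values are listed in the camel_morphology features reference):

from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.reinflector import Reinflector

db = MorphologyDB.builtin_db(flags='r')
reinflector = Reinflector(db)

# Reinflect a singular noun into its plural forms
for analysis in reinflector.reinflect('كتاب', {'pos': 'noun', 'num': 'p'}):
    print(analysis['diac'])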
Example 9
class TClean:


    def __init__(self):
        
        self.CuratedList = self.loadCuratedList()
        self.stop_words = set(stopwords.words('arabic'))
        self.arStemmer = Analyzer(MorphologyDB.builtin_db())
        self.sentSegRegexPattern = self.loadSentSegmentationList()        
        self.DotChar = '_'
        
    #
    #
    #

    def loadCuratedList(self):
        # Each line maps an incorrect form to its correction, separated by ':::'
        cList = {}
        with open('../resources/CuratedList.txt', 'r', encoding="utf-8") as curatedFile:
            for strLine in curatedFile:
                strKeyVal = strLine.replace('\n', '').split(":::")
                self.add_if_key_not_exist(cList, strKeyVal[0], strKeyVal[1])

        return cList
    
    #
    #
    #
      
    def loadSentSegmentationList(self):
        # Build an alternation regex from the list of sentence delimiters
        delimiterList = []
        with open('../resources/sent_segmentation_list.txt', 'r', encoding="utf-8") as sent_segmentationFile:
            for strLine in sent_segmentationFile:
                strLine = ' ' + strLine.replace('\n', '').strip() + ' '
                delimiterList.append(strLine)

        return '(' + '|'.join(map(regEx.escape, delimiterList)) + ')'

    #
    #
    #
       
    def getSentTokenization(self, strDoc):
        return sent_tokenize(strDoc)
    #
    #
    #
    def getWTokens(self, strTxt):        
        return word_tokenize(strTxt)
    #
    #
    #
    
    def getSegSentTokenization(self, strSentence, minSeqSentLen=30):
        # Short sentences are returned as-is; longer ones are split on the
        # delimiter pattern built by loadSentSegmentationList()
        if len(strSentence) <= minSeqSentLen:
            return [strSentence]
        return regEx.split(self.sentSegRegexPattern, strSentence)
    
    #
    #
    #
        
    def softCleaning(self, strText):
        
        #
        # Remove newline
        strText = strText.replace('\n', ' ')
        
        #
        # Remove Tashkeel
        strText = dediac_ar(strText)
        
        #
        # Replace any matched token with its correction from the curated list
        for incorrectToken, correctedToken in self.CuratedList.items():            
            strText = strText.replace(incorrectToken, correctedToken)
        
        #
        # Fix comma and semicolon usage
        strText = self.replaceWrongComa(strText)
        
        #
        # remove extra spaces 
        strText = regEx.sub(" +", " ", strText)
        
        return strText

    #
    #
    #
    
    def hardCleaning(self, strText, removeStopWord=False, applyLemmatize=False):
        
        #
        #
        # Apply soft cleaning first
        strText = self.softCleaning(strText)
        
        #
        # Normalise
        strText = normalize_teh_marbuta_ar(strText)   # teh marbuta (ة) -> heh (ه)
        strText = normalize_alef_ar(strText)          # hamzated alef variants -> bare alef (ا)
        strText = normalize_alef_maksura_ar(strText)  # alef maksura (ى) -> yeh (ي)
        
        #
        #
        strText = self.removeNonArabicChar(strText)
        
        #
        #
        strText = self.lemmatizeAndRemoveDotFromToken(strText, removeStopWord, applyLemmatize)
        
        # Remove final sentence-dots
        #strText = strText.replace('.', ' ')
        return strText

    #
    #
    #
  

    def replaceWrongComa(self, strText):

        # Map Latin comma, semicolon and question mark to their Arabic counterparts
        strText = strText.replace(",", "،").replace(";", "؛").replace("?", "؟")
        #
        # Surround punctuation with spaces so tokens separate correctly
        strText = strText.replace("،", " ، ").replace("؛", " ؛ ").replace("؟", " ؟ ").replace(":", " : ").replace(".", " . ")

        return strText
    #
    #
    #
    
    def removeNonArabicChar(self, strText):

        #
        # Remove English and other non-Arabic (including special) characters
        strText = regEx.compile(r'([^\n\u060C-\u064A\.:؟?])').sub(' ', strText)
        #
        # Remove extra spaces
        return regEx.sub(" +", " ", strText)
    #
    #
    #

    def lemmatizeAndRemoveDotFromToken(
            self, strDoc,
            removeStopWord=False,
            applyLemmatize=False):
        
        getTokens = word_tokenize(strDoc)        
        strDoc = ""
        
        for strToken in getTokens:
            #
            sT = strToken.strip()
            #
            # skip if it's a stop word
            if removeStopWord and sT in self.stop_words:
                continue
            #
            #
            if applyLemmatize:
                sT = self.getStemWToken(sT)                
            #
            # check Dots
            if '.' in sT and len(sT) > 1:
                sT = sT.replace(".", self.DotChar)
            #
            # Drop single-character tokens (other than a lone dot)
            if len(sT) < 2 and '.' not in sT:
                continue
               
            strDoc += sT + ' ' 
            
        return strDoc.strip()

    #
    #
    #

    def getStemWToken(self, wToken):
        try:
            stemObject = self.arStemmer.analyze(wToken)

            # Remove tashkeel and normalise the stem of the first analysis
            strText = dediac_ar(stemObject[0]['stem'])
            strText = normalize_teh_marbuta_ar(strText)   # teh marbuta (ة) -> heh (ه)
            strText = normalize_alef_ar(strText)          # hamzated alef variants -> bare alef (ا)
            strText = normalize_alef_maksura_ar(strText)
            return strText
        except Exception:
            # Fall back to the surface token if analysis fails
            return wToken
       
 
        
    #
    #
    #

    def add_if_key_not_exist(self, dict_obj, key, value):
        if key not in dict_obj:
            dict_obj.update({key: value})
    
    #
    #
    #
    
    def toRound(self, dVal, iDigits=2):
        return np.round(dVal, iDigits)

    #
    #
    #

    def readTxtFile(self, strPath):
        with open(strPath, 'r', encoding="utf-8") as file:
            return file.read().replace("\n", " ")
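
# Hypothetical end-to-end usage of TClean (assumes the ../resources files
# above exist; the input path is a placeholder):
#
#   cleaner = TClean()
#   raw = cleaner.readTxtFile('doc.txt')
#   clean = cleaner.hardCleaning(raw, removeStopWord=True, applyLemmatize=True)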
# Corpus sentence-length statistics. NOTE: the start of this snippet was
# truncated in the source; the loop header and counters below are a
# reconstruction assuming `sentences` was read from the file handle `fd`.
min_sentence = float('inf')
max_sentence = 0
sentence_size = 0

for sentence in sentences:
    s_size = len(sentence)
    if s_size < min_sentence:
        min_sentence = s_size
    if s_size > max_sentence:
        max_sentence = s_size
    sentence_size += s_size

fd.close()

print(min_sentence, max_sentence, sentence_size / len(sentences))

# Extract morphological properties of every word in the corpus

db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)

# # Create analyzer with NOAN_PROP backoff
# analyzer = Analyzer(db, 'NOAN_PROP')

training_set = []

for sentence in sentences:
    s = []
    for word in sentence:

        analyses = analyzer.analyze(word['INPUT STRING'])
        # print(word, analyses)
        for d in analyses:
            # print(get_tag(d['bw']) == sentences[0][0]['POS'])
            tag = get_tag(d['bw'])
            # Assumption: the source truncates here; a plausible completion
            # collects (word, tag) pairs for each sentence
            s.append((word['INPUT STRING'], tag))
    training_set.append(s)
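
The loop above assumes a get_tag helper that reduces a Buckwalter analysis string to a bare POS tag; a hypothetical sketch of such a helper:

def get_tag(bw):
    # A 'bw' value looks like 'kitAb/NOUN+u/CASE_DEF_NOM'; keep the tag of
    # the first morpheme (hypothetical reconstruction, not the original code)
    return bw.split('+')[0].split('/')[-1]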