# Imports assumed by the snippets below (not shown in the original source);
# file_io and affix_features are project-local helpers, so their import
# paths are a guess and left commented out.
import spacy
import pyphen
# import file_io
# import affix_features


def __init__(self, language, ablate=None):
        """
        Define basic properties

        Args:
            language(str): language of input data
            ablate: optional feature name(s) to exclude, e.g. for ablation
                experiments
        """
        self.language = language

        if self.language == 'english':
            # should be in data/external
            self.u_prob = file_io.read_file('data/external/english_u_prob.csv')

        if self.language == 'spanish':
            self.u_prob = file_io.read_file('data/external/spanish_u_prob.csv')

        # if self.language == 'german':
        #     self.u_prob = file_io.read_file('data/external/german_u_prob.csv')
        # Load the spaCy model used for tokenisation.
        if self.language == "english":
            self.nlp = spacy.load('en_core_web_lg')
        elif self.language == "spanish":
            self.nlp = spacy.load("es_core_news_md")
        elif self.language == "german":
            self.nlp = spacy.load('de_core_news_sm')
        elif self.language == "french":
            self.nlp = spacy.load('fr_core_news_md')

        self.ablate = ablate

        # TODO: load the pyphen hyphenation dictionaries here.
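        # A minimal sketch of what the pending pyphen loading might look like,
        # mirroring the spaCy branching above; the hyph_codes mapping and the
        # self.hyph attribute name are assumptions, not original code.
        hyph_codes = {'english': 'en', 'spanish': 'es',
                      'german': 'de', 'french': 'fr'}
        if self.language in hyph_codes:
            self.hyph = pyphen.Pyphen(lang=hyph_codes[self.language])
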
        """Build a Frequency Index reference for spanish language"""
        """if self.language == 'spanish':
Esempio n. 2
0
    def __init__(self, language):
        """
        Define basic properties

        Args:
            language(str): language of input data
        """
        self.language = language

        if self.language == 'english':
            # should be in data/external
            self.u_prob = file_io.read_file('data/external/english_u_prob.csv')

        if self.language == 'spanish':
            print('reading unigram probs')
            self.u_prob = file_io.read_file('data/external/spanish_u_prob.csv')

        # Load the spaCy model used for tokenisation.
        if self.language == "english":
            self.nlp = spacy.load('en_core_web_lg')
        elif self.language == "spanish":
            self.nlp = spacy.load("es_core_news_md")
        elif self.language == "german":
            self.nlp = spacy.load('de_core_news_sm')
        elif self.language == "french":
            self.nlp = spacy.load('fr_core_news_md')

        # TODO: load the pyphen hyphenation dictionaries here.

        """Build a Frequency Index reference for spanish language"""
        if self.language == 'spanish':
            self.esp_freq_index = {}
            with open("data/external/spanish_subtitle_words_frequency_indexes.txt", "r", encoding="utf-8") as f:
                for line in f.readlines():
                    wd = line.split(",")[0]
                    FI = int(line.split(",")[1])
                    self.esp_freq_index[wd] = FI
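
# Note: the frequency-index file is assumed (from the split(",") parsing
# above) to hold one "word,frequency_index" pair per line, e.g. "que,2".


# Example 3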
    def __init__(self, language=None, ablate=None, features_to_use=None):
        """
        Define basic properties

        Args:
            language(str): language of input data
            ablate: optional feature name(s) to exclude, e.g. for ablation
                experiments
            features_to_use: a list of feature names (strings) to use
        """
        # Maps every available feature to the list of heavyweight resources it
        # needs (e.g. ['spacy']), or None if it has no such requirement.
        feature_requirements = {
            'is_nounphrase': ['spacy'],
            'len_tokens_norm': ['spacy'],
            'hypernym_count': None,
            'len_chars_norm': None,
            'len_tokens': None,
            'len_syllables': ['hyph'],
            'consonant_freq': None,
            'gr_or_lat': ['affix'],
            'is_capitalised': None,
            'num_complex_punct': None,
            'avg_chars_p_word': None,
            'sent_length': None,
            'unigram_prob': ['unigram_probs'],
            'char_n_gram_feats': None,
            'sent_n_gram_feats': None,
            'iob_tags': ['spacy'],
            'lemma_feats': ['spacy'],
            'bag_of_shapes': ['spacy'],
            'pos_tag_counts': ['spacy'],
            'NER_tag_counts': ['spacy'],
        }

        if features_to_use is None or features_to_use == 'all':
            features_to_use = list(feature_requirements.keys())

        # Collect the unique set of resources required by the chosen features,
        # skipping any feature name we don't recognise.
        self.total_requirements = set()
        final_features = []
        for feature in features_to_use:
            if feature in feature_requirements:
                if feature_requirements[feature] is not None:
                    for requirement in feature_requirements[feature]:
                        self.total_requirements.add(requirement)
                final_features.append(feature)
            else:
                print("{} did not match any of the features in "
                      "feature_requirements, so was not used.".format(feature))

        self.features_to_use = final_features
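        # For example, features_to_use=['sent_length', 'len_syllables'] keeps
        # both names and yields {'hyph'} as the only requirement (illustrative
        # values, not from the original source).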

        self.affixes = {}
        self.spacy_models = {
            'english': None,
            'spanish': None,
            'german': None,
            'french': None
        }
        self.hyph_dictionaries = {
            'english': None,
            'spanish': None,
            'german': None,
            'french': None
        }
        self.unigram_prob_dict = {
            'english': None,
            'spanish': None,
            'german': None,
            'french': None
        }

        # So that we're only opening this file once.
        if 'affix' in self.total_requirements:
            self.affixes = affix_features.get_affixes()

        # Per-language resource names, kept in dicts so the loading below is
        # data-driven instead of repeated once per language.
        spacy_model_names = {
            'english': 'en_core_web_lg',
            'spanish': 'es_core_news_md',
            'german': 'de_core_news_sm',
            'french': 'fr_core_news_md',
        }
        pyphen_lang_codes = {
            'english': 'en',
            'spanish': 'es',
            'german': 'de',
            'french': 'fr',
        }

        # Load resources for the requested language only; when no known
        # language is given, load them for every supported language.
        if language in spacy_model_names:
            languages_to_load = [language]
        else:
            languages_to_load = list(spacy_model_names)

        for lang in languages_to_load:
            # Assign into the pre-initialised dicts so that languages we do
            # not load keep their None placeholders.
            if 'spacy' in self.total_requirements:
                self.spacy_models[lang] = spacy.load(spacy_model_names[lang])

            if 'hyph' in self.total_requirements:
                self.hyph_dictionaries[lang] = pyphen.Pyphen(
                    lang=pyphen_lang_codes[lang])

            if 'unigram_probs' in self.total_requirements:
                self.unigram_prob_dict[lang] = file_io.read_file(
                    'data/external/{}_u_prob.csv'.format(lang))

        self.ablate = ablate
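
# Usage sketch (illustrative; the enclosing class name is not shown in the
# original, so FeatureExtractor here is an assumption):
#
#     extractor = FeatureExtractor(language='spanish',
#                                  features_to_use=['len_syllables',
#                                                   'sent_length'])
#     extractor.total_requirements   # {'hyph'}: only the Spanish pyphen
#                                    # dictionary is loaded; no spaCy model
#                                    # or unigram table is touched.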