def __prepareFeatures(self, dataset: str): # find or create the features file self.__calculateTokensEntropyLoss(dataset) # get features file languageFeatures = json.loads( FileManager.readFile(FileManager.getFeaturesFileUrl(self.type))) X = [] Y = [] sources, languages = self.extractSources(dataset) for idx, source in enumerate(sources): language = languages[idx] features = [] tokens = set(source.split(' ')) # X for _lang in languageFeatures: for _tk in languageFeatures[_lang]: if _tk in tokens: features.append(1) else: features.append(0) X.append(features) # Y Y.append(language) return X, Y
def __calculateTokensEntropyLoss(self, dataset: str): if os.path.exists(FileManager.getFeaturesFileUrl(self.type)): return self sources, languages = self.extractSources(dataset) withTokensOccurencyMap: dict = {} withoutTokensOccurencyMap: dict = {} for index, source in enumerate(sources): language = languages[index] tokens = set(source.split(' ')) for token in tokens: if token not in withTokensOccurencyMap: withTokensOccurencyMap[token] = [] withTokensOccurencyMap[token].append(language) for index, source in enumerate(sources): language = languages[index] tokens = set(source.split(' ')) for token in withTokensOccurencyMap: if token not in tokens: if token not in withoutTokensOccurencyMap: withoutTokensOccurencyMap[token] = [] withoutTokensOccurencyMap[token].append(language) tokensMetrics: dict = {} for language in ConfigurationManager.getLanguages(): tokensMetrics[language] = {} for token in withTokensOccurencyMap: tokensMetrics[language][token] = {} tokensMetrics[language][token][ 'numberOfExamplesWithFeatureF']: int = len( withTokensOccurencyMap[token]) tokensMetrics[language][token][ 'numberOfExamplesWithoutFeatureF']: int = len( withoutTokensOccurencyMap[token]) tokensMetrics[language][token][ 'numberOfPositiveExamplesWithFeatureF']: int = len([ lg for lg in withTokensOccurencyMap[token] if lg == language ]) tokensMetrics[language][token][ 'numberOfPositiveExamplesWithoutFeatureF']: int = len([ lg for lg in withoutTokensOccurencyMap[token] if lg == language ]) languageFeatures = {} tokensEntropyLoss: dict = {} numberOfExamples = self.Dataset.countExamples(dataset) N_OF_TOKENS_FOR_LANGUAGE: int = self.config[ 'number_of_tokens_for_language'] for language in ConfigurationManager.getLanguages(): tokensEntropyLoss[language] = {} numberOfPositiveExamples: int = self.Dataset.getCounters( dataset)[language] for token in tokensMetrics[language]: tokensEntropyLoss[language][token] = 0 metrics = tokensMetrics[language][token] numberOfExamplesWithFeatureF = metrics[ 'numberOfExamplesWithFeatureF'] numberOfExamplesWithoutFeatureF = metrics[ 'numberOfExamplesWithoutFeatureF'] numberOfPositiveExamplesWithFeatureF = metrics[ 'numberOfPositiveExamplesWithFeatureF'] numberOfPositiveExamplesWithoutFeatureF = metrics[ 'numberOfPositiveExamplesWithoutFeatureF'] # preparing entropy formula vars pr_C: float = numberOfPositiveExamples / numberOfExamples pr_f: float = numberOfExamplesWithFeatureF / numberOfExamples pr_C_f: float = numberOfPositiveExamplesWithFeatureF / numberOfExamplesWithFeatureF pr_C_notf: float = numberOfPositiveExamplesWithoutFeatureF / numberOfExamplesWithoutFeatureF # TODO: use this https://scikit-learn.org/stable/modules/preprocessing.html#scaling-features-to-a-range pr_C = (pr_C if pr_C > 0 else .0001) pr_f = (pr_f if pr_f > 0 else .0001) pr_C_f = (pr_C_f if pr_C_f > 0 else .0001) pr_C_notf = (pr_C_notf if pr_C_notf > 0 else .0001) # TODO: use this https://scikit-learn.org/stable/modules/preprocessing.html#scaling-features-to-a-range pr_C = (pr_C if pr_C < 1 else .9999) pr_f = (pr_f if pr_f < 1 else .9999) pr_C_f = (pr_C_f if pr_C_f < 1 else .9999) pr_C_notf = (pr_C_notf if pr_C_notf < 1 else .9999) # calculating token's entropy e = -(pr_C * math.log2(pr_C)) - ( (1 - pr_C) * math.log2(1 - pr_C)) e_f = -(pr_C_f * math.log2(pr_C_f)) - ( (1 - pr_C_f) * math.log2(1 - pr_C_f)) e_not_f = -(pr_C_notf * math.log2(pr_C_notf)) - ( (1 - pr_C_notf) * math.log2(1 - pr_C_notf)) tokensEntropyLoss[language][token] = e - (e_f * pr_f) + (e_not_f * (1 - pr_f)) # sort entropy values by desc order tokensEntropyLoss[language] = { k: v for k, v in sorted(tokensEntropyLoss[language].items(), key=lambda item: item[1]) } # take first n tokens languageFeatures[language] = list( tokensEntropyLoss[language].keys())[:N_OF_TOKENS_FOR_LANGUAGE] # export tokens with maximum entropy loss FileManager.writeFile(FileManager.getFeaturesFileUrl(self.type), json.dumps(languageFeatures)) return self