Ejemplo n.º 1
0
def run(V, n, d):
    #n
    size = n
    #V
    vocab = V
    #d
    smoothing = d

    in_file1 = "./OriginalDataSet/training-tweets.txt"
    n_gram = N_Gram(in_file1, size, vocab, smoothing)
    [uni, bi, tri] = n_gram.count()
    language_count = n_gram.language_count

    tweets = []

    eu = Language('eu', uni['eu'], bi['eu'], tri['eu'])
    es = Language('es', uni['es'], bi['es'], tri['es'])
    pt = Language('pt', uni['pt'], bi['pt'], tri['pt'])
    en = Language('en', uni['en'], bi['en'], tri['en'])
    ca = Language('ca', uni['ca'], bi['ca'], tri['ca'])
    gl = Language('gl', uni['gl'], bi['gl'], tri['gl'])

    eu.cal_conditional_probabilities(size, smoothing)
    es.cal_conditional_probabilities(size, smoothing)
    pt.cal_conditional_probabilities(size, smoothing)
    en.cal_conditional_probabilities(size, smoothing)
    ca.cal_conditional_probabilities(size, smoothing)
    gl.cal_conditional_probabilities(size, smoothing)

    eu.export_model(vocab, size, smoothing)
    es.export_model(vocab, size, smoothing)
    pt.export_model(vocab, size, smoothing)
    en.export_model(vocab, size, smoothing)
    ca.export_model(vocab, size, smoothing)
    gl.export_model(vocab, size, smoothing)

    in_file2 = "./OriginalDataSet/test-tweets-given.txt"
    input_file = open(in_file2, "r", encoding="utf8")

    for line in input_file:
        try:
            [tweet_id, user_name, language, text] = line.split('\t')
            tweets.append(
                Tweet(tweet_id, user_name, language, text.strip('\n')))
        except:
            print('ERROR reading file')
    input_file.close()

    overall_result = Counter()
    language_result = {
        'eu': Counter(),
        'ca': Counter(),
        'gl': Counter(),
        'es': Counter(),
        'en': Counter(),
        'pt': Counter()
    }
    language_predictions = Counter()
    debug = 0

    if os.path.exists(f'ModifiedDataSet/trace_{vocab}_{size}_{smoothing}.txt'):
        os.remove(f'ModifiedDataSet/trace_{vocab}_{size}_{smoothing}.txt')
    for tweet in tweets:
        try:
            if vocab == 0:
                tweet.lower_case()
            elif vocab == 1:
                tweet.case_sensitive()
            elif vocab == 2:
                tweet.is_alpha()
            tweet.counter()

            count = {}
            if size == 1:
                count = tweet.uni
            elif size == 2:
                count = tweet.bi
            elif size == 3:
                count = tweet.tri

            score = {}

            score['eu'] = eu.score(count, language_count)
            score['es'] = es.score(count, language_count)
            score['ca'] = ca.score(count, language_count)
            score['gl'] = gl.score(count, language_count)
            score['pt'] = pt.score(count, language_count)
            score['en'] = en.score(count, language_count)

            estimate_l = max(score, key=score.get)
            estimate_s = score[estimate_l]

            if debug < 0:
                print(tweet.tweet_id)
                print(score)
                print(estimate_l, tweet.language)

            language_predictions[estimate_l] += 1
            if estimate_l == tweet.language:
                overall_result['right'] += 1
                language_result[tweet.language]['right'] += 1
            else:
                overall_result['wrong'] += 1
                language_result[tweet.language]['wrong'] += 1

            # Trace Output File
            with open(f'ModifiedDataSet/trace_{vocab}_{size}_{smoothing}.txt',
                      'a',
                      encoding='utf8') as trace_file:
                correct_wrong = 'correct' if estimate_l == tweet.language else 'wrong'
                trace_file.write(
                    f'{tweet.tweet_id}  {estimate_l}  {estimate_s:.2E}  {tweet.language}  {correct_wrong}\n'
                )

            debug += 1
        except Exception as error_msg:
            print(f'ERROR calculating score: {error_msg}')

    # Eval Output File
    with open(f'ModifiedDataSet/eval_{vocab}_{size}_{smoothing}.txt',
              'w',
              encoding='utf8') as eval_file:
        accuracy = round(
            overall_result['right'] / sum(overall_result.values()), 4)
        per_class_precision = []
        per_class_recall = []
        for language in language_result:
            per_class_precision.append(
                round(
                    language_result[language]['right'] /
                    language_predictions[language],
                    4)) if language_predictions[
                        language] > 0 else per_class_precision.append(0)
            per_class_recall.append(
                round(
                    language_result[language]['right'] /
                    sum(language_result[language].values()), 4))
        per_class_f1 = [
            round((x * y) / (x + y), 2) if x > 0 or y > 0 else 0.0
            for x, y in zip(per_class_precision, per_class_recall)
        ]
        macro_f1 = round(sum(per_class_f1) / len(per_class_f1), 4)

        weighted_f1 = 0
        for index, language in enumerate(language_result):
            weighted_f1 += sum(
                language_result[language].values()) * per_class_f1[index]
        weighted_f1 = round(weighted_f1 / sum(overall_result.values()), 4)

        eval_file.write(f'{accuracy}\n')
        eval_file.writelines(f'{c}  ' for c in per_class_precision)
        eval_file.write('\n')
        eval_file.writelines(f'{c}  ' for c in per_class_recall)
        eval_file.write('\n')
        eval_file.writelines(f'{c}  ' for c in per_class_f1)
        eval_file.write('\n')
        eval_file.write(f'{macro_f1}  {weighted_f1}')

    print('right: ',
          (overall_result['right'] / sum(overall_result.values())) * 100, '%')
    print('wrong: ',
          (overall_result['wrong'] / sum(overall_result.values())) * 100, '%')