Example #1
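All six snippets appear to come from the same project and omit a shared preamble: standard-library imports plus a helper module bound to the name `l` (providing timer, regex, ngram, readsample, myclassifier, wordgrams, summerize and savetofile). A minimal sketch of that assumed preamble follows; the helper module's real name is not shown in the source, so the import alias is a placeholder.

import copy
import datetime
import glob
import os
import re
import sys

import langutils as l  #hypothetical name for the project's helper module
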
def clearCorpus():
	#****************** START Corpus cleaner ********************************
	os.system('clear')
	print ('\n\n{}'.format('='*100))
	print ('AUTOMATIC LANGUAGE IDENTIFIER USING CUMULATIVE FREQUENCY ADDITION - CORPUS CLEANER'.center(100,' '))
	print ('-'*100)

	print ("\nLoading corpus files to memory ... ")
	path = 'corpus/rawSource/'
	
	started = datetime.datetime.now()
	
	for infile in glob.glob(os.path.join(path, '*.txt')): #iterate over the .txt files in the directory
		try:
			#Extract the two-letter language code from the file name
			filename = os.path.basename(infile)
			lang = filename[:2]

			#Open and read one file from the corpus
			with open(infile, 'r', encoding='utf8') as f:
				rawtext = [lang, f.read()]

			print ('-'*100)
			print ('\nOpening relevant files ...  \t\t\t\t\t\t{}'.format(l.timer(started)))

			cleantext = l.regex(rawtext)[1] #cleaned source file content, i.e. vocabulary

			path1 = 'corpus/cleanSource/'
			#'w' truncates an existing file, so no need to remove it first
			with open(os.path.join(path1, filename), 'w', encoding='utf8') as c:
				c.write(str(cleantext))

			print ('\nSuccessfully cleaned file {}'.format(filename))
			
		except IOError:
			print ('Error: cannot open the file:', infile)
			return
	else:
		#for-else: runs only when the loop finishes without a break
		print ('\nStarted:', started)
		ended = datetime.datetime.now()
		elapsed = ended - started
		print ('End    :', ended)
		print ('Elapsed:', elapsed)
Example #2
def classification(frequencyDict,
                   uniquengrams,
                   totalngrams,
                   phraselength=25,
                   wordbased=0,
                   location=0,
                   infinity=0,
                   maxg=5):

    #*************************** START Reading Files ************************************
    started = datetime.datetime.now()

    s = open('sample.txt', 'r', encoding='utf8')

    print('-' * 100)
    print('\nFiles {} loaded to memory ...  \t\t\t\t\t\t{}'.format(
        'sample.txt', l.timer(started)))
    mytime = datetime.datetime.now()
    print('Opening relevant files ...  \t\t\t\t\t\t\t{}'.format(
        l.timer(mytime)))
    mytime = datetime.datetime.now()

    print('Reading language models ...  \t\t\t\t\t\t\t{}'.format(
        l.timer(mytime)))
    mytime = datetime.datetime.now()

    sample = s.readlines()
    s.close()
    #*************************** END Reading Files ***************************************
    print('Reading test strings ...  \t\t\t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()

    readsampled = l.readsample(sample, phraselength)
    sampled = readsampled[0]
    averagebyte = int(readsampled[1])
    averagecharacters = int(readsampled[2])

    #each element looks like: ['am', [['እው', 2], ['ውነ', 2], ['ነት', 2], ['እውነ', 3], ['ውነት', 3], ['እውነት', 4]]]
    testing = []

    temp = []
    phrases = 0
    for i in sampled:
        if wordbased == 0:
            testing.append(l.ngram(l.regex(i), 1))
        else:
            temp = l.regex(i)[1].split()
            wordlist = []
            #both location settings use the same l.ngram call, so the duplicate branch is collapsed
            for n in temp:
                wordlist.extend(l.ngram([i[0], n], 1, location, infinity)[1])
            testing.append([i[0], wordlist])
        phrases += 1

    grams = list(range(2, maxg + 1))

    lang = dict(am={}, ge={}, gu={}, ti={})
    base = {'CFA': {}, 'NBC': {}}
    wrongs = copy.deepcopy(base)
    mytotals = copy.deepcopy(base)
    fscore = copy.deepcopy(base)

    mytotal = copy.deepcopy(lang)

    classifiers = {'CFA': 0, 'NBC': 0}
    averageprecision = copy.deepcopy(classifiers)
    averageaccuracy = copy.deepcopy(classifiers)
    averagefscore = copy.deepcopy(classifiers)
    averagerecall = copy.deepcopy(classifiers)
    averagetotal = copy.deepcopy(classifiers)

    for i in grams:
        fscore['CFA'][i] = 0
        fscore['NBC'][i] = 0

    precision = copy.deepcopy(fscore)
    total = copy.deepcopy(fscore)
    totaltests = copy.deepcopy(fscore)
    recall = copy.deepcopy(fscore)
    accuracy = copy.deepcopy(fscore)

    #one zeroed counter per n-gram size, copied into each language slot
    totals = {j: 0 for j in grams}

    for i in mytotal:
        mytotal[i] = copy.deepcopy(totals)

    mytotals['CFA'] = copy.deepcopy(mytotal)
    mytotals['NBC'] = copy.deepcopy(mytotal)
    myrecall = copy.deepcopy(mytotals)

    #wrong classifications, e.g. Amharic classified as Guragigna
    for i in mytotal:
        wrongs['CFA'][i] = copy.deepcopy(mytotal)
        wrongs['NBC'][i] = copy.deepcopy(mytotal)

    confusion = copy.deepcopy(wrongs)

    print('Creating language dictionaries ...  \t\t\t\t\t\t{}'.format(
        l.timer(mytime)))
    mytime = datetime.datetime.now()

    l.myclassifier(testing, frequencyDict, grams, wrongs, totaltests, myrecall,
                   total, uniquengrams, totalngrams, phrases)

    print('\tPerforming classifications ...  \t\t\t\t\t{}'.format(
        l.timer(mytime)))
    mytime = datetime.datetime.now()

    for i in confusion['CFA']:
        for j in confusion['CFA'][i]:
            if i == j:
                confusion['CFA'][i][j] = myrecall['CFA'][j]
                confusion['NBC'][i][j] = myrecall['NBC'][j]
            else:
                confusion['CFA'][i][j] = wrongs['CFA'][i][j]
                confusion['NBC'][i][j] = wrongs['NBC'][i][j]

    for g in grams:

        #the /4 divisors below macro-average over the four languages
        for i in lang:
            numerator = 0
            denominator = 0
            n = 0
            d = 0
            for j in confusion['CFA']:
                if i == j:
                    numerator += confusion['CFA'][j][i][g]
                    n += confusion['NBC'][j][i][g]
                denominator += confusion['CFA'][j][i][g]
                d += confusion['NBC'][j][i][g]

            precision['CFA'][g] += (numerator / denominator /
                                    4) if denominator != 0 else 0
            precision['NBC'][g] += (n / d / 4) if d != 0 else 0

        for x in confusion['CFA']:
            numerator = 0
            denominator = 0
            n = 0
            d = 0
            for y in lang:
                if x == y:
                    numerator += confusion['CFA'][x][y][g]
                    n += confusion['NBC'][x][y][g]
                denominator += confusion['CFA'][x][y][g]
                d += confusion['NBC'][x][y][g]

            recall['CFA'][g] += (numerator / denominator /
                                 4) if denominator != 0 else 0
            recall['NBC'][g] += (n / d / 4) if d != 0 else 0
            accuracy['CFA'][g] += numerator
            accuracy['NBC'][g] += n

        accuracy['CFA'][g] /= total['CFA'][g] if total['CFA'][g] != 0 else 1
        accuracy['NBC'][g] /= total['NBC'][g] if total['NBC'][g] != 0 else 1

        averageaccuracy['CFA'] += accuracy['CFA'][g] / 4
        averageaccuracy['NBC'] += accuracy['NBC'][g] / 4

        averagetotal['CFA'] += totaltests['CFA'][g]
        averagetotal['NBC'] += totaltests['NBC'][g]

        averageprecision['CFA'] += precision['CFA'][g] / 4
        averageprecision['NBC'] += precision['NBC'][g] / 4

        averagerecall['CFA'] += recall['CFA'][g] / 4
        averagerecall['NBC'] += recall['NBC'][g] / 4

    for g in grams:
        #harmonic mean of precision and recall, guarding the zero-denominator case
        fscore['CFA'][g] = 2 * (
            (precision['CFA'][g] * recall['CFA'][g]) /
            (precision['CFA'][g] + recall['CFA'][g])) if (
                precision['CFA'][g] != 0.00 or recall['CFA'][g] != 0.00) else 0
        fscore['NBC'][g] = 2 * (
            (precision['NBC'][g] * recall['NBC'][g]) /
            (precision['NBC'][g] + recall['NBC'][g])) if (
                precision['NBC'][g] != 0.00 or recall['NBC'][g] != 0.00) else 0
        averagefscore['CFA'] += fscore['CFA'][g] / 4
        averagefscore['NBC'] += fscore['NBC'][g] / 4

    print(
        'Generating performance metrics - precision, recall and f-score ...  \t\t{}'
        .format(l.timer(mytime)))
    mytime = datetime.datetime.now()

    if os.path.isfile('result.txt'): os.remove('result.txt')

    for i in classifiers:
        print(
            '\nAverage length of test strings: {:,} word(s) / {:,} character(s) / {:,} bytes'
            .format(phraselength, averagecharacters, averagebyte))
        print('=' * 100)
        print('{:<16}|{:<15}|{:<15}|{:<15}|{:<15}|{:<15}'.format(
            'Ngrams', 'Observations', 'Accuracy', 'Precision', 'Recall',
            'F-score'))
        print('-' * 100)

        for g in grams:
            print(
                '{:<3} {:<10}\t|{:,}\t\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}'
                .format(i, g, totaltests[i][g], accuracy[i][g],
                        precision[i][g], recall[i][g], fscore[i][g]))
        print('-' * 100)

        print(
            '{:<3} {:<10}\t|{:,}\t\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}'
            .format(i, '(2,3,4,5)', averagetotal[i], averageaccuracy[i],
                    averageprecision[i], averagerecall[i], averagefscore[i]))
        print('-' * 100)

    print('\nGenerating classification performance results ...  \t\t\t\t{}'.
          format(l.timer(mytime)))
    print('\nStarted:', started)
    ended = datetime.datetime.now()
    print('End    :', ended)
    print('Elapsed: {}'.format(l.timer(started)))
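The scoring itself happens inside l.myclassifier, which the listing does not show. As a point of reference, here is a minimal sketch of Cumulative Frequency Addition, the method named in the banners: sum each test n-gram's relative frequency in every language model and pick the language with the highest sum. The function name and the toy models are illustrative assumptions, not the project's API.

def cfa_classify(ngrams, models):
    #models maps a language code to an {ngram: count} frequency table
    scores = {}
    for lang, freq in models.items():
        total = sum(freq.values()) or 1
        #cumulative frequency addition: add up the relative frequencies
        scores[lang] = sum(freq.get(g, 0) / total for g in ngrams)
    return max(scores, key=scores.get)

models = {'am': {'እው': 2, 'ውነ': 2, 'ነት': 1},
          'ti': {'ሰላ': 3, 'ላም': 1}}
print(cfa_classify(['እው', 'ውነ', 'ነት'], models))  #-> 'am'
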
Example #3
def sampling(selection=10):
    #****************** START Sample generator ********************************
    print(
        "\nLoading testing files to memory and generating testing samples ... "
    )
    files = '*.txt'

    started = datetime.datetime.now()
    language = dict(am='Amharic', ge='Geez', gu='Guragigna', ti='Tigrigna')
    base = dict(am=0, ge=0, gu=0, ti=0)
    counts = {}

    for i in range(1, selection + 1):
        counts[i] = copy.deepcopy(base)

    for ct in range(1, 11):
        path = 'corpus/testing/' + str(ct) + '/'
        path1 = 'samples/'
        samples = {}
        duplicate = set()
        filename = str(ct) + '.txt'
        totals = 0
        for i in range(1, selection + 1):
            counts[i] = copy.deepcopy(base)
        try:
            if os.path.isfile(os.path.join(path1, filename)):
                os.remove(os.path.join(path1, filename))
            s = open(os.path.join(path1, filename), 'a+')

            print('\n\n{}\nGenerating sample {}\n{}'.format(
                '=' * 110, ct, '-' * 110))
            for infile in glob.glob(os.path.join(
                    path, files)):  #opens files from the directory

                cfilename = os.path.basename(infile)
                lang = cfilename[:2]

                c = open(infile, 'r', encoding='utf8')
                raw = c.read()
                c.close()

                #Word level parsing
                listed = l.regex([lang, raw])[1].strip()

                t = [0]
                length = len(listed)

                for i in re.finditer(' ', listed):
                    t.append(i.start())
                t.append(length)

                print(
                    '\n{} - Completed identifying phrase markers for {} language'
                    .format(datetime.datetime.now(), language[lang]))

                test = []
                r = len(t) - 1

                for x, i in enumerate(t):
                    print('\b' * 40, end='')
                    print('{} - {:0.2f}%'.format(datetime.datetime.now(),
                                                 (x / r) * 100),
                          end='')
                    for j in t:
                        if j >= t[x]: continue
                        temp = listed[j:t[x]].strip()
                        length = len(temp.split())
                        #skip phrases that are too long, too short or already seen
                        if length > selection or temp in duplicate or len(temp) < 2:
                            continue
                        temps = [lang, temp, sys.getsizeof(temp)]
                        test.append(temps)
                        duplicate.add(temp)

                samples[lang] = len(test)
                print(
                    '\n{} - Created {:,} test phrases for {} language'.format(
                        datetime.datetime.now(), samples[lang],
                        language[lang]))

                r = max(len(test) - 1, 1)  #avoid division by zero in the progress readout

                for x, temp in enumerate(test):
                    print('\b' * 40, end='')
                    print('{} - {:0.2f}%'.format(datetime.datetime.now(),
                                                 (x / r) * 100),
                          end='')

                    words = len(temp[1].split())
                    counts[words][temp[0]] += 1

                    #[lang,string,number of words,number of characters,bytes]
                    s.write('{},{},{},{},{}\r\n'.format(
                        temp[0], temp[1], words, len(temp[1].strip()),
                        temp[2]))
            else:
                #for-else: close the sample file once all source files are processed
                s.close()
        except IOError:
            print('Error: cannot open the file: ',
                  'Corpus file {}.txt'.format(ct))
            return

        totals = sum(samples.values())
        print('\n\nSamples per language: {}'.format(samples))

        print('\nWords \t Phrases \t Details')
        for i in range(1, selection + 1):
            inlang = sum(counts[i].values())
            print('{:<3} \t {:,} \t\t {}'.format(i, inlang, counts[i]))

        print('\n{:,} test strings were successfully created in {}{}.'.format(
            totals, path1, filename))
        print('\nStarted:', started)
        ended = datetime.datetime.now()
        elapsed = ended - started
        print('End    :', ended)
        print('Elapsed:', elapsed)
        print('\n')
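The double loop over the marker list t above is the heart of the sampler: every space position (plus both ends of the text) becomes a marker, and every ordered marker pair yields one candidate phrase. A minimal sketch of the same idea on toy input:

import re

listed = 'one two three'
t = [0] + [m.start() for m in re.finditer(' ', listed)] + [len(listed)]
#every marker pair (j < x) delimits one candidate phrase
phrases = {listed[j:x].strip() for x in t for j in t if j < x}
print(sorted(phrases))
#-> ['one', 'one two', 'one two three', 'three', 'two', 'two three']
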
Example #4
def modeler(started, mod, wordbased=0, location=0, infinity=0):
    #****************** START Model generator ********************************
    print('\n{}'.format('=' * 110))
    print('\n{} - Loading corpus files to memory and generating models ...'.
          format(datetime.datetime.now()))

    files = '*.txt'
    mostfrequent = {}  #the most frequent ngram in the corpus
    wordcount = {}
    vocabulary = {}
    language = dict(am='Amharic', ge='Geez', gu='Guragigna', ti='Tigrigna')
    maxg = 5

    started = datetime.datetime.now()  #note: this shadows the 'started' argument
    myfile = mod + '.txt'
    for ct in range(1, 11):
        path = 'corpus/training/' + str(ct) + '/'
        model = []
        print('\n{}\nProcessing corpus {}\n{}'.format('=' * 110, ct,
                                                      '-' * 110))
        for infile in glob.glob(os.path.join(
                path, files)):  #opens files from the directory
            try:

                #Extract the two-letter language code from the file name
                filename = os.path.basename(infile)
                lang = filename[:2]

                #Open and read one file from the corpus
                f = open(infile, 'r', encoding='utf8')
                raw = f.read()
                rawtext = [lang, raw]
                f.close()

                #Word/Ngram level parsing
                ngrams = []
                #Model type codes (the 'mod' argument):
                #  bl - fixed-length n-grams without location features (baseline)
                #  by - byte-order n-grams built from the source text
                #  fl - fixed-length n-grams with location features
                #  in - infini-grams without location features
                #  il - infini-grams with location features
                #  wr - word frequency model without location features
                if mod == 'bl':  #to generate ngrams from source text wordbased baseline
                    temp = l.regex(rawtext)[1].split()
                    wordlist = []
                    for i in temp:
                        wordlist.extend(l.ngram([lang, i])[1])
                    ngrams = [lang, wordlist, mod]

                elif mod == 'by':  #to generate ngrams from source text fixed byteorder
                    ngrams = l.ngram(l.regex(rawtext))
                    ngrams.append(mod)

                elif mod == 'fl':  #fixed-length n-grams with location features, from words in the source files
                    temp = l.regex(rawtext)[1].split()
                    wordlist = []
                    for i in temp:
                        wordlist.extend(
                            l.ngram([lang, i], 0, location, infinity)[1])
                    ngrams = [lang, wordlist, mod]

                elif mod == 'il' or mod == 'in':  #infini-grams, with ('il') or without ('in') location features
                    temp = l.regex(rawtext)[1].split()
                    wordlist = []
                    for i in temp:
                        #track the longest word so maxg covers the longest n-gram generated
                        maxg = max(maxg, len(i))
                        #both location settings use the same call, so the duplicate branch is collapsed
                        wordlist.extend(
                            l.ngram([lang, i], 0, location, infinity)[1])
                    ngrams = [lang, wordlist, mod]

                elif mod == 'wr':  #to generate word frequency model from source files
                    temp = l.regex(rawtext)[1].split()
                    ngrams = [lang, temp, mod]

                print(
                    '\t{} - Completed removing punctuation marks and numbers for {} language'
                    .format(datetime.datetime.now(), language[lang]))
                summary = l.summerize(model, l.wordgrams(ngrams), mostfrequent,
                                      wordcount, vocabulary)

                print(
                    '\t{} - Completed building sorted frequency distribution for {} language'
                    .format(datetime.datetime.now(), language[lang]))
                print('{}{}'.format('\t', '-' * 100))

            except IOError:
                print('Error: cannot open the file: ', infile)
                return
        print('{} - Saving the model for all languages to models/{}/{}'.format(
            datetime.datetime.now(), ct, myfile))

        l.savetofile(summary, started, mod, maxg, ct)
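l.ngram itself is not part of the listing. The mod codes suggest it produces character n-grams of length 2..maxg, or up to the whole word when the infinity flag is set ("infini-grams"). A rough sketch under that assumption; the tagging of each n-gram with its length (as in the sample comment in Example #2) is omitted for brevity:

def char_ngrams(word, maxg=5, infinity=0):
    #fixed-length mode caps n at maxg; infinity mode goes up to the word length
    limit = len(word) if infinity else min(maxg, len(word))
    grams = []
    for n in range(2, limit + 1):
        for i in range(len(word) - n + 1):
            grams.append(word[i:i + n])
    return grams

print(char_ngrams('እውነት', maxg=3))  #-> ['እው', 'ውነ', 'ነት', 'እውነ', 'ውነት']
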
Example #5
def clearCommons():
	#****************** START Commons remover ********************************
	os.system('clear')
	print ('\n\n{}'.format('='*100))
	print ('AUTOMATIC LANGUAGE IDENTIFIER USING CUMULATIVE FREQUENCY ADDITION - COMMONS REMOVER'.center(100,' '))
	print ('-'*100)

	print ("\nLoading corpus files to memory ... ")
	path = 'corpus/cr/300'
	
	alllist =[] ; vocabulary = set()
	started = datetime.datetime.now()
	content = {}
	language = dict(am='Amharic',ge='Geez',gu='Guragigna',ti='Tigrigna')
	
	commons =set()
	
	for infile in glob.glob(os.path.join(path, '*.txt')): #opens files from directory
		try:
			#Extract the two-letter language code from the file name
			filename = os.path.basename(infile)
			lang = filename[:2]

			#Open and read one file from the corpus
			with open(infile, 'r', encoding='utf8') as f:
				rawtext = [lang, f.read()]

			print ('-'*100) 
			print ('\nOpening relevant files ...  \t\t\t\t\t\t{}'.format(l.timer(started)))

			cleaned = l.regex(rawtext)[1] #clean the raw text once and reuse it
			content[lang] = set(cleaned.split()) #source file vocabulary, i.e. unique words

			listed = cleaned.split() #full word list for this language
			alllist.append(listed)

			vocabulary.update(content[lang]) #running union of all language vocabularies

			print('{} - Completed building relevant dictionaries for {} language'.format(datetime.datetime.now(),language[lang]))
		
		except IOError:
			print ('Error: cannot open the file:', infile)
			return
	
	r = len(vocabulary) ; w = 0
	for i in content:
		w+= len(content[i])

	print ('-'*100)
	print('{} - Matching {:,} vocabulary items to {:,} word entries across all languages'.format(datetime.datetime.now(),r,w))
	
	for i in content:	#a term is common if it occurs in at least two language vocabularies
		for j in content:
			if i == j: continue
			commons.update(content[i].intersection(content[j]))

	path1 = 'corpus/cc/300'
	for infile in glob.glob(os.path.join(path, '*.txt')):
		filename = os.path.basename(infile) ; lang = filename[:2]

		with open(infile, 'r', encoding='utf8') as f:
			rawtext = l.regex([lang, f.read()])[1]

		#drop every common term, then rewrite the cleaned file ('w' truncates)
		cleared = ' '.join(filter(lambda x: x not in commons, rawtext.split()))
		with open(os.path.join(path1, filename), 'w', encoding='utf8') as c:
			c.write(str(cleared))

	print ('\nA total of {} common terms listed in commons.txt were removed from the corpus'.format(len(commons)))

	with open('commons.txt', 'w', encoding='utf8') as s: #'w' truncates any existing file
		s.write(str(commons))
	
	print ('\nStarted:', started)
	ended = datetime.datetime.now()
	elapsed = ended - started
	print ('End    :', ended)
	print ('Elapsed:', elapsed)
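The pairwise intersection loop above is easy to check on toy data: a term is "common" exactly when it appears in at least two language vocabularies. An illustrative reduction of the same logic:

content = {'am': {'a', 'b', 'c'}, 'ti': {'b', 'd'}, 'ge': {'c', 'd'}}
commons = set()
for i in content:
    for j in content:
        if i == j: continue
        commons.update(content[i].intersection(content[j]))
print(sorted(commons))  #-> ['b', 'c', 'd']
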
Example #6
def corper():
    #****************** START Corpus partitioner ********************************
    print(
        "\nLoading clean source files to memory and generating training and testing corpora ... "
    )
    files = '*.txt'

    started = datetime.datetime.now()
    language = dict(am='Amharic', ge='Geez', gu='Guragigna', ti='Tigrigna')

    path = 'corpus/cleanSource/'
    for infile in glob.glob(os.path.join(path,
                                         files)):  #opens files from the directory

        filename = os.path.basename(infile)
        lang = filename[:2]

        print('\n\n{}\nGenerating Testing and Training corpus for {}\n{}'.format(
            '=' * 110, language[lang], '-' * 110))

        c = open(infile, 'r', encoding='utf8')
        raw = c.read()
        c.close()

        #Word level parsing
        listed = l.regex([lang, raw])[1].strip()

        length = len(listed)

        partition = int(length / 10)

        marker = [x * partition for x in range(0, 11)]

        for j, i in enumerate(marker):
            if j == 0: continue  #the first marker has no preceding slice
            path1 = 'corpus/training/' + str(j) + '/'
            path2 = 'corpus/testing/' + str(j) + '/'

            #fold j: the j-th tenth is held out for testing, the rest is joined for training
            testing = listed[marker[j - 1]:i]
            training = ' '.join([listed[:marker[j - 1]], listed[i:]])

            #'w' truncates any existing file, so no need to remove it first
            with open(os.path.join(path1, filename), 'w', encoding='utf8') as tr:
                tr.write(str(training))
            with open(os.path.join(path2, filename), 'w', encoding='utf8') as ts:
                ts.write(str(testing))

            print(
                'Corpus length: {}\t Training length: {}\t Testing length: {}'.
                format(length, len(training), len(testing)))
            print(
                '\n{} - Completed creating training and testing corpus {} in file {}'
                .format(datetime.datetime.now(), j, filename))


    else:
        #for-else: report timing once the loop over source files completes without a break
        print('\nStarted:', started)
        ended = datetime.datetime.now()
        elapsed = ended - started
        print('End    :', ended)
        print('Elapsed:', elapsed)
        print('\n')
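The marker arithmetic in corper() implements a 10-fold hold-out split: fold j keeps the j-th tenth for testing and concatenates the rest for training. A minimal sketch on synthetic text, ignoring the joining space the real code inserts:

text = 'abcdefghij' * 3
partition = len(text) // 10
marker = [x * partition for x in range(0, 11)]
for j in range(1, 11):
    testing = text[marker[j - 1]:marker[j]]
    training = text[:marker[j - 1]] + text[marker[j]:]
    assert len(testing) + len(training) == len(text)
print('all 10 folds partition the text cleanly')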