import copy
import datetime
import glob
import os
import re
import sys

# NOTE: the helper module is referenced throughout as `l`, but its real name
# is not shown in this file; `lid` below is an assumed placeholder. It must
# provide regex(), ngram(), timer(), readsample(), myclassifier(),
# summerize(), wordgrams() and savetofile().
import lid as l


def clearCorpus():
    #****************** START Corpus cleaner ********************************
    os.system('clear')
    print('\n\n{}'.format('=' * 100))
    print('AUTOMATIC LANGUAGE IDENTIFIER USING CUMULATIVE FREQUENCY ADDITION - CORPUS CLEANER'.center(100, ' '))
    print('-' * 100)
    print('\nLoading corpus files to memory ... ')
    path = 'corpus/rawSource/'
    started = datetime.datetime.now()
    for infile in glob.glob(os.path.join(path, '*.txt')):  #open files from the directory
        try:
            #Extract the file name; the first two letters encode the language
            filename = infile.split('/')[-1]
            lang = filename[:2]
            #Open and read the file from the corpus
            f = open(infile, 'r', encoding='utf8')
            rawtext = [lang, f.read()]
            f.close()
            print('-' * 100)
            print('\nOpening relevant files ... \t\t\t\t\t\t{}'.format(l.timer(started)))
            cleantext = l.regex(rawtext)[1]  #cleaned source file content
            path1 = 'corpus/cleanSource/'
            if os.path.isfile(os.path.join(path1, filename)):
                os.remove(os.path.join(path1, filename))
            c = open(os.path.join(path1, filename), 'a+')
            c.write(str(cleantext))
            c.close()
            print('\nSuccessfully cleaned file {}'.format(filename))
        except IOError:
            print('Error: Cannot open the file: ', lang)
            return
    print('\nStarted:', started)
    ended = datetime.datetime.now()
    elapsed = ended - started
    print('End    :', ended)
    print('Elapsed:', elapsed)
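
# clearCorpus() delegates the actual cleaning to l.regex(). A minimal,
# self-contained sketch of what such a cleaner might do for Ethiopic text is
# shown below; the function name and the exact character ranges are
# assumptions for illustration, not the helper library's implementation.
def _clean_ethiopic_demo(text):
    """Keep Ethiopic characters and spaces; drop punctuation, digits, Latin."""
    kept = re.sub(r'[^\u1200-\u137F\s]+', ' ', text)  # U+1200-U+137F: Ethiopic block
    return re.sub(r'\s+', ' ', kept).strip()          # collapse runs of whitespace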
def classification(frequencyDict, uniquengrams, totalngrams, phraselength=25,
                   wordbased=0, location=0, infinity=0, maxg=5):
    #*************************** START Reading Files ************************************
    started = datetime.datetime.now()
    s = open('sample.txt', 'r')  #, encoding='utf8')
    print('-' * 100)
    print('\nFiles {} loaded to memory ... \t\t\t\t\t\t{}'.format('sample.txt', l.timer(started)))
    mytime = datetime.datetime.now()
    print('Opening relevant files ... \t\t\t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()
    # model = f.readlines()
    # matrix = m.read()
    print('Reading language models ... \t\t\t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()
    sample = s.readlines()
    # f.close()
    s.close()
    # m.close()
    #*************************** END Reading Files **************************************
    print('Reading test strings ... \t\t\t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()
    readsampled = l.readsample(sample, phraselength)
    sampled = readsampled[0]
    averagebyte = int(readsampled[1])
    averagecharacters = int(readsampled[2])
    #e.g. [['am', [['እው', 2], ['ውነ', 2], ['ነት', 2], ['እውነ', 3], ['ውነት', 3], ['እውነት', 4]]]]
    testing = []
    phrases = 0
    for i in sampled:
        if wordbased == 0:
            testing.append(l.ngram(l.regex(i), 1))
        else:
            wordlist = []
            for n in l.regex(i)[1].split():
                #NOTE: the original had identical branches for location == 0 and
                #location != 0; they are collapsed into one call here.
                wordlist.extend(l.ngram([i[0], n], 1, location, infinity)[1])
            testing.append([i[0], wordlist])
        phrases += 1
    grams = list(range(2, maxg + 1))
    lang = dict(am={}, ge={}, gu={}, ti={})
    base = {'CFA': {}, 'NBC': {}}
    wrongs = copy.deepcopy(base)
    mytotals = copy.deepcopy(base)
    fscore = copy.deepcopy(base)
    mytotal = copy.deepcopy(lang)
    classifiers = {'CFA': 0, 'NBC': 0}
    averageprecision = copy.deepcopy(classifiers)
    averageaccuracy = copy.deepcopy(classifiers)
    averagefscore = copy.deepcopy(classifiers)
    averagerecall = copy.deepcopy(classifiers)
    averagetotal = copy.deepcopy(classifiers)
    for i in grams:
        fscore['CFA'][i] = 0
        fscore['NBC'][i] = 0
    precision = copy.deepcopy(fscore)
    total = copy.deepcopy(fscore)
    totaltests = copy.deepcopy(fscore)
    recall = copy.deepcopy(fscore)
    accuracy = copy.deepcopy(fscore)
    totals = {}
    for i in mytotal:
        for j in grams:
            totals[j] = 0
        mytotal[i] = copy.deepcopy(totals)
    mytotals['CFA'] = copy.deepcopy(mytotal)
    mytotals['NBC'] = copy.deepcopy(mytotal)
    myrecall = copy.deepcopy(mytotals)
    #wrong classifications, e.g. Amharic classified as Guragigna
    for i in mytotal:
        wrongs['CFA'][i] = copy.deepcopy(mytotal)
        wrongs['NBC'][i] = copy.deepcopy(mytotal)
    confusion = copy.deepcopy(wrongs)
    print('Creating language dictionaries ... \t\t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()
    l.myclassifier(testing, frequencyDict, grams, wrongs, totaltests, myrecall,
                   total, uniquengrams, totalngrams, phrases)
    print('\tPerforming classifications ... \t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()
    #Assemble the confusion matrix: diagonal cells hold correct counts,
    #off-diagonal cells hold misclassification counts.
    for i in confusion['CFA']:
        for j in confusion['CFA'][i]:
            if i == j:
                confusion['CFA'][i][j] = myrecall['CFA'][j]
                confusion['NBC'][i][j] = myrecall['NBC'][j]
            else:
                confusion['CFA'][i][j] = wrongs['CFA'][i][j]
                confusion['NBC'][i][j] = wrongs['NBC'][i][j]
    for g in grams:
        #Precision per language i: true positives over everything predicted
        #as i; dividing by 4 macro-averages over the four languages.
        for i in lang:
            numerator = denominator = n = d = 0
            for j in confusion['CFA']:
                if i == j:
                    numerator += confusion['CFA'][j][i][g]
                    n += confusion['NBC'][j][i][g]
                denominator += confusion['CFA'][j][i][g]
                d += confusion['NBC'][j][i][g]
            precision['CFA'][g] += (numerator / denominator / 4) if denominator != 0 else 0
            precision['NBC'][g] += (n / d / 4) if d != 0 else 0
        #Recall per language x: true positives over everything that actually is x.
        for x in confusion['CFA']:
            numerator = denominator = n = d = 0
            for y in lang:
                if x == y:
                    numerator += confusion['CFA'][x][y][g]
                    n += confusion['NBC'][x][y][g]
                denominator += confusion['CFA'][x][y][g]
                d += confusion['NBC'][x][y][g]
            recall['CFA'][g] += (numerator / denominator / 4) if denominator != 0 else 0
            recall['NBC'][g] += (n / d / 4) if d != 0 else 0
            accuracy['CFA'][g] += numerator
            accuracy['NBC'][g] += n
        accuracy['CFA'][g] /= total['CFA'][g] if total['CFA'][g] != 0 else 1
        accuracy['NBC'][g] /= total['NBC'][g] if total['NBC'][g] != 0 else 1
        averageaccuracy['CFA'] += accuracy['CFA'][g] / 4
        averageaccuracy['NBC'] += accuracy['NBC'][g] / 4
        averagetotal['CFA'] += totaltests['CFA'][g]
        averagetotal['NBC'] += totaltests['NBC'][g]
        averageprecision['CFA'] += precision['CFA'][g] / 4
        averageprecision['NBC'] += precision['NBC'][g] / 4
        averagerecall['CFA'] += recall['CFA'][g] / 4
        averagerecall['NBC'] += recall['NBC'][g] / 4
    for g in grams:
        #F-score = 2PR / (P + R); guard against a zero denominator. The
        #original closed the condition's parenthesis in the wrong place
        #((p != 0 or r) != 0); fixed here.
        fscore['CFA'][g] = 2 * ((precision['CFA'][g] * recall['CFA'][g]) /
                                (precision['CFA'][g] + recall['CFA'][g])) if (
            precision['CFA'][g] != 0.00 or recall['CFA'][g] != 0.00) else 0
        fscore['NBC'][g] = 2 * ((precision['NBC'][g] * recall['NBC'][g]) /
                                (precision['NBC'][g] + recall['NBC'][g])) if (
            precision['NBC'][g] != 0.00 or recall['NBC'][g] != 0.00) else 0
        averagefscore['CFA'] += fscore['CFA'][g] / 4
        averagefscore['NBC'] += fscore['NBC'][g] / 4
    print('Generating performance metrics - precision, recall and f-score ... \t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()
    if os.path.isfile('result.txt'):
        os.remove('result.txt')
    # r = open('result.txt', 'a+')
    for i in classifiers:
        print('\nAverage length of test strings: {:,} word(s) / {:,} character(s) / {:,} bytes'.format(
            phraselength, averagecharacters, averagebyte))
        print('=' * 100)
        print('{:<16}|{:<15}|{:<15}|{:<15}|{:<15}|{:<15}'.format(
            'Ngrams', 'Observations', 'Accuracy', 'Precision', 'Recall', 'F-score'))
        print('-' * 100)
        for g in grams:
            print('{:<3} {:<10}\t|{:,}\t\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}'.format(
                i, g, totaltests[i][g], accuracy[i][g], precision[i][g], recall[i][g], fscore[i][g]))
        print('-' * 100)
        print('{:<3} {:<10}\t|{:,}\t\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}'.format(
            i, '(2,3,4,5)', averagetotal[i], averageaccuracy[i],
            averageprecision[i], averagerecall[i], averagefscore[i]))
        print('-' * 100)
    print('\nGenerating classification performance results ... \t\t\t\t{}'.format(l.timer(mytime)))
    print('\nStarted:', started)
    ended = datetime.datetime.now()
    print('End    :', ended)
    print('Elapsed: {}'.format(l.timer(started)))
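
# A worked example of the metric arithmetic in classification(), using a
# hypothetical 2x2 confusion matrix (rows = actual language, columns =
# predicted language). The numbers are illustrative only.
#
#                 predicted am   predicted ti
#   actual am          40              10
#   actual ti           5              45
#
# precision(am) = 40 / (40 + 5)  = 0.8889   (column sum as denominator)
# recall(am)    = 40 / (40 + 10) = 0.8000   (row sum as denominator)
# f-score(am)   = 2pr / (p + r)  = 0.8421
def _fscore_demo():
    confusion = {'am': {'am': 40, 'ti': 10}, 'ti': {'am': 5, 'ti': 45}}
    for lang in confusion:
        tp = confusion[lang][lang]
        col = sum(confusion[actual][lang] for actual in confusion)  #predicted as lang
        row = sum(confusion[lang].values())                         #actually lang
        p = tp / col if col else 0
        r = tp / row if row else 0
        f = 2 * p * r / (p + r) if (p or r) else 0
        print('{}: precision={:.4f} recall={:.4f} f-score={:.4f}'.format(lang, p, r, f))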
def sampling(selection=10):
    #****************** START Test sample generator ********************************
    print('\nLoading testing files to memory and generating testing samples ... ')
    files = '*.txt'
    started = datetime.datetime.now()
    language = dict(am='Amharic', ge='Geez', gu='Guragigna', ti='Tigrigna')
    base = dict(am=0, ge=0, gu=0, ti=0)
    counts = {}
    for i in range(1, selection + 1):
        counts[i] = copy.deepcopy(base)
    for ct in range(1, 11):
        path = 'corpus/testing/' + str(ct) + '/'
        path1 = 'samples/'
        samples = {}
        duplicate = set()
        filename = str(ct) + '.txt'
        totals = 0
        for i in range(1, selection + 1):  #reset the per-word-count counters for each corpus
            counts[i] = copy.deepcopy(base)
        try:
            if os.path.isfile(os.path.join(path1, filename)):
                os.remove(os.path.join(path1, filename))
            s = open(os.path.join(path1, filename), 'a+')
            print('\n\n{}\nGenerating sample {}\n{}'.format('=' * 110, ct, '-' * 110))
            for infile in glob.glob(os.path.join(path, files)):  #open files from the directory
                cfilename = infile.split('/')[-1]
                lang = cfilename[:2]
                c = open(infile, 'r')
                raw = c.read()
                c.close()
                #Word level parsing: record every word boundary (space) offset
                listed = l.regex([lang, raw])[1].strip()
                t = [0]
                length = len(listed)
                for i in re.finditer(' ', listed):
                    t.append(i.start())
                t.append(length)
                print('\n{} - Completed identifying phrase markers for {} language'.format(
                    datetime.datetime.now(), language[lang]))
                test = []
                r = len(t) - 1
                for x, i in enumerate(t):
                    print('\b' * 40, end='')
                    print('{} - {:0.2f}%'.format(datetime.datetime.now(), (x / r) * 100), end='')
                    for j in t:
                        if j >= t[x]:
                            continue
                        temp = listed[j:t[x]].strip()
                        length = len(temp.split())
                        #Keep only unique phrases of at most `selection` words
                        if length > selection or temp in duplicate or len(temp) < 2:
                            continue
                        test.append([lang, temp, sys.getsizeof(temp)])
                        duplicate.add(temp)
                samples[lang] = len(test)
                print('\n{} - Created {:,} test phrases for {} language'.format(
                    datetime.datetime.now(), samples[lang], language[lang]))
                r = len(test) - 1
                for x, temp in enumerate(test):
                    print('\b' * 40, end='')
                    print('{} - {:0.2f}%'.format(datetime.datetime.now(), (x / r) * 100), end='')
                    words = len(temp[1].split())
                    counts[words][temp[0]] += 1
                    #CSV record: lang, string, number of words, number of characters, bytes
                    s.write('{},{},{},{},{}\r\n'.format(
                        temp[0], temp[1], words, len(temp[1].strip()), temp[2]))
            s.close()
        except IOError:
            print('Error: Cannot open the file: ', 'Corpus file {}.txt'.format(ct))
            return
        totals = sum(samples.values())
        print('\n\nsamples {}'.format(samples))
        print('\nWords \t Phrases \t Details')
        for i in range(1, selection + 1):
            inlang = sum(counts[i].values())
            print('{:<3} \t {:,} \t\t {}'.format(i, inlang, counts[i]))
        print('\n{:,} Test strings are successfully created in {}{}.'.format(totals, path1, filename))
    print('\nStarted:', started)
    ended = datetime.datetime.now()
    elapsed = ended - started
    print('End    :', ended)
    print('Elapsed:', elapsed)
    print('\n')
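
# sampling() enumerates candidate phrases by pairing word-boundary offsets:
# every substring that starts and ends on a recorded space marker is a
# candidate, filtered by word count, minimum length and uniqueness. A minimal
# sketch of the same idea on a toy string (name and values are illustrative):
def _phrase_demo(text='one two three', max_words=2):
    bounds = [0] + [m.start() for m in re.finditer(' ', text)] + [len(text)]
    seen = set()
    for end in bounds:
        for start in bounds:
            if start >= end:
                continue
            phrase = text[start:end].strip()
            if len(phrase) >= 2 and len(phrase.split()) <= max_words:
                seen.add(phrase)
    return sorted(seen)  #['one', 'one two', 'three', 'two', 'two three']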
def modeler(started, mod, wordbased=0, location=0, infinity=0):
    #****************** START Model generator ********************************
    print('\n{}'.format('=' * 110))
    print('\n{} - Loading corpus files to memory and generating models ...'.format(datetime.datetime.now()))
    files = '*.txt'
    mostfrequent = {}  #the most frequent ngram in the corpus
    wordcount = {}
    vocabulary = {}
    language = dict(am='Amharic', ge='Geez', gu='Guragigna', ti='Tigrigna')
    maxg = 5
    started = datetime.datetime.now()
    myfile = mod + '.txt'
    #Model type codes (mod):
    #  bl - fixed-length n-grams without location features (baseline)
    #  by - byte-order n-grams generated from the raw source text
    #  fl - fixed-length n-grams with location features
    #  in - infiniti-grams without location features
    #  il - infiniti-grams with location features
    #  wr - word frequency model without location features
    for ct in range(1, 11):
        path = 'corpus/training/' + str(ct) + '/'
        model = []
        print('\n{}\nProcessing corpus {}\n{}'.format('=' * 110, ct, '-' * 110))
        for infile in glob.glob(os.path.join(path, files)):  #open files from the directory
            try:
                #Extract the file name; the first two letters encode the language
                filename = infile.split('/')[-1]
                lang = filename[:2]
                #Open and read the file from the corpus
                f = open(infile, 'r')  #, encoding='utf8')
                rawtext = [lang, f.read()]
                f.close()
                #Word/Ngram level parsing
                ngrams = []
                if mod == 'bl':  #word-based fixed-length n-grams (baseline)
                    wordlist = []
                    for i in l.regex(rawtext)[1].split():
                        wordlist.extend(l.ngram([lang, i])[1])
                    ngrams = [lang, wordlist, mod]
                elif mod == 'by':  #byte-order n-grams from the source text
                    ngrams = l.ngram(l.regex(rawtext))
                    ngrams.append(mod)
                elif mod == 'fl':  #fixed-length n-grams with location features
                    wordlist = []
                    for i in l.regex(rawtext)[1].split():
                        wordlist.extend(l.ngram([lang, i], 0, location, infinity)[1])
                    ngrams = [lang, wordlist, mod]
                elif mod == 'il' or mod == 'in':  #infiniti-grams, with/without location features
                    wordlist = []
                    for i in l.regex(rawtext)[1].split():
                        maxg = max(maxg, len(i))  #track the longest word seen
                        #NOTE: the original had identical branches for location == 0
                        #and location != 0; they are collapsed into one call here.
                        wordlist.extend(l.ngram([lang, i], 0, location, infinity)[1])
                    ngrams = [lang, wordlist, mod]
                elif mod == 'wr':  #word frequency model
                    ngrams = [lang, l.regex(rawtext)[1].split(), mod]
                print('\t{} - Completed removing punctuation marks and numbers for {} language'.format(
                    datetime.datetime.now(), language[lang]))
                summary = l.summerize(model, l.wordgrams(ngrams), mostfrequent, wordcount, vocabulary)
                print('\t{} - Completed building sorted frequency distribution for {} language'.format(
                    datetime.datetime.now(), language[lang]))
                print('{}{}'.format('\t', '-' * 100))
            except IOError:
                print('Error: Cannot open the file: ', lang)
                return
        print('{} - Saving the model for all languages to models/{}/{}'.format(
            datetime.datetime.now(), ct, myfile))
        l.savetofile(summary, started, mod, maxg, ct)
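
# modeler() relies on l.ngram() to expand words into character n-grams. A
# minimal sketch of fixed-length n-gram extraction over the 2- to 5-gram
# range this module uses is given below; l.ngram()'s real output format,
# including its location and infinity options, may differ.
def _char_ngrams_demo(word, low=2, high=5):
    grams = []
    for n in range(low, high + 1):
        for i in range(len(word) - n + 1):  #sliding window of width n
            grams.append(word[i:i + n])
    return grams

# Example: _char_ngrams_demo('abcd') -> ['ab', 'bc', 'cd', 'abc', 'bcd', 'abcd']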
def clearCommons():
    #****************** START Commons remover ********************************
    os.system('clear')
    print('\n\n{}'.format('=' * 100))
    print('AUTOMATIC LANGUAGE IDENTIFIER USING CUMULATIVE FREQUENCY ADDITION - COMMONS REMOVER'.center(100, ' '))
    print('-' * 100)
    print('\nLoading corpus files to memory ... ')
    path = 'corpus/cr/300'
    alllist = []
    vocabulary = set()
    started = datetime.datetime.now()
    content = {}
    language = dict(am='Amharic', ge='Geez', gu='Guragigna', ti='Tigrigna')
    commons = set()
    for infile in glob.glob(os.path.join(path, '*.txt')):  #open files from the directory
        try:
            #Extract the file name; the first two letters encode the language
            filename = infile.split('/')[-1]
            lang = filename[:2]
            #Open and read the file from the corpus
            f = open(infile, 'r', encoding='utf8')
            rawtext = [lang, f.read()]
            f.close()
            print('-' * 100)
            print('\nOpening relevant files ... \t\t\t\t\t\t{}'.format(l.timer(started)))
            content[lang] = set(l.regex(rawtext)[1].split())  #source file content set, i.e. vocabulary
            listed = l.regex(rawtext)[1].split()  #source file content as a full word list
            alllist.append(listed)
            for i in content:  #update the vocabulary set with the union of itself and the new set
                vocabulary.update(content[i])
            print('{} - Completed building relevant dictionaries for {} language'.format(
                datetime.datetime.now(), language[lang]))
        except IOError:
            print('Error: Cannot open the file: ', lang)
            return
    r = len(vocabulary)
    w = 0
    for i in content:
        w += len(content[i])
    print('-' * 100)
    print('{} - Matching {:,} vocabulary items to {:,} ngrams in all languages'.format(
        datetime.datetime.now(), r, w))
    for i in content:  #keep only the terms that occur in more than one language's set
        for j in content:
            if i == j:
                continue
            commons.update(content[i].intersection(content[j]))
    path1 = 'corpus/cc/300'
    for infile in glob.glob(os.path.join(path, '*.txt')):
        filename = infile.split('/')[-1]
        lang = filename[:2]
        if os.path.isfile(os.path.join(path1, filename)):
            os.remove(os.path.join(path1, filename))
        f = open(infile, 'r')
        rawtext = l.regex([lang, f.read()])[1]
        f.close()
        cleared = ' '.join(filter(lambda x: x not in commons, rawtext.split()))
        c = open(os.path.join(path1, filename), 'a+')
        c.write(str(cleared))
        c.close()
    print('\nA total of {} common terms listed in commons.txt are removed from the corpus'.format(len(commons)))
    if os.path.isfile('commons.txt'):
        os.remove('commons.txt')
    s = open('commons.txt', 'a+')
    s.write(str(commons))
    s.close()
    print('\nStarted:', started)
    ended = datetime.datetime.now()
    elapsed = ended - started
    print('End    :', ended)
    print('Elapsed:', elapsed)
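
# The commons detection in clearCommons() keeps any token that appears in at
# least two languages' vocabularies. The same pairwise-intersection idea on
# toy sets (values are illustrative only):
def _commons_demo():
    content = {'am': {'a', 'b', 'c'}, 'ti': {'b', 'c', 'd'}, 'ge': {'c', 'e'}}
    commons = set()
    for i in content:
        for j in content:
            if i != j:
                commons.update(content[i] & content[j])
    return commons  #{'b', 'c'}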
def corper():
    #****************** START Corpus partitioner ********************************
    print('\nLoading testing files to memory and generating testing samples ... ')
    files = '*.txt'
    started = datetime.datetime.now()
    language = dict(am='Amharic', ge='Geez', gu='Guragigna', ti='Tigrigna')
    path = 'corpus/cleanSource/'
    for infile in glob.glob(os.path.join(path, files)):  #open files from the directory
        filename = infile.split('/')[-1]
        lang = filename[:2]
        print('\n\n{}\nGenerating Test and Training corpus for {}\n{}'.format(
            '=' * 110, language[lang], '-' * 110))
        c = open(infile, 'r')
        raw = c.read()
        c.close()
        #Word level parsing: split the cleaned text into ten equal partitions
        listed = l.regex([lang, raw])[1].strip()
        length = len(listed)
        partition = int(length / 10)
        marker = [x * partition for x in range(0, 11)]
        for j, i in enumerate(marker):
            if j == 0:  #the first marker has no preceding slice (original checked j == i)
                continue
            path1 = 'corpus/training/' + str(j) + '/'
            path2 = 'corpus/testing/' + str(j) + '/'
            if os.path.isfile(os.path.join(path1, filename)):
                os.remove(os.path.join(path1, filename))
            if os.path.isfile(os.path.join(path2, filename)):
                os.remove(os.path.join(path2, filename))
            #Fold j: the j-th tenth is the test set, the rest is training
            testing = listed[marker[j - 1]:i]
            training = ' '.join([listed[:marker[j - 1]], listed[i:]])
            tr = open(os.path.join(path1, filename), 'a+')
            ts = open(os.path.join(path2, filename), 'a+')
            ts.write(str(testing))
            tr.write(str(training))
            ts.close()
            tr.close()
            print('Corpus length: {}\t Training length: {}\t Testing Length: {}'.format(
                length, len(training), len(testing)))
            print('\n{} - Completed creating training and testing corpus {} in file {}'.format(
                datetime.datetime.now(), j, filename))
    print('\nStarted:', started)
    ended = datetime.datetime.now()
    elapsed = ended - started
    print('End    :', ended)
    print('Elapsed:', elapsed)
    print('\n')
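
# corper() implements a 10-fold split: fold j uses the j-th tenth of the
# cleaned text as testing data and the remaining nine tenths as training
# data. A compact sketch of the same partitioning (illustrative only; any
# tail left over by the integer division ends up in the training slice):
def _tenfold_demo(text):
    part = len(text) // 10
    folds = []
    for j in range(1, 11):
        lo, hi = (j - 1) * part, j * part
        testing = text[lo:hi]
        training = text[:lo] + ' ' + text[hi:]
        folds.append((training, testing))
    return folds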