Esempio n. 1
0
File: main.py Progetto: smeylan/opus
# programmatic interface for open_sub_2013

import os
import pdb

import opus
import ngrok

# Root directory handed to opus.makeDirectoryStructure.
slowstoragedir = '/shared_hd0/corpora/OPUS/2013_OPUS'
# Language codes to run through the pipeline.
languages = ['ru'] #'cs','pl','ro','sv'
# Passed as the third argument to ngrok.cleanUnigramCountFile.
numItems = 25000

# Use the configured root rather than repeating the path literal, so the two
# cannot drift apart.
directories = opus.makeDirectoryStructure(slowstoragedir)
countdir = directories['intermediatecountpath']

# Each stage is run for every language before the next stage begins.
# Plain loops are used because the calls are made only for their side effects.
for language in languages:
	opus.downloadLanguage(language, directories['downloadpath'], directories['expandpath'])

for language in languages:
	opus.processLanguage(language, directories['expandpath'], directories['combinedpath'])

for language in languages:
	opus.combineLanguage(
		os.path.join(directories['combinedpath'], language),
		os.path.join(countdir, language+'_combined.txt'))

# Per-language unigram pipeline: count -> rearrange -> clean.
# Every intermediate file is written into the intermediate count directory.
for language in languages:
	print('Counting ngrams...')
	countfile = os.path.join(countdir, language+'_counted.txt')
	ngrok.countNgrams(os.path.join(countdir, language+'_combined.txt'), countfile, n=1)

	print('Rearranging ngrams...')
	rearrangedFile = os.path.join(countdir, language+'_2013_rearrange_counts.txt')
	ngrok.rearrangeNgramFile(countfile, rearrangedFile, reverse=False)

	print('Cleaning ngrams...')
	cleanedFile = os.path.join(countdir, language+'_2013_cleaned.txt')
	ngrok.cleanUnigramCountFile(rearrangedFile, cleanedFile, numItems, language)

	print('Sorting ngrams...')
	# NOTE(review): only the target path is computed here; no sorting call is
	# visible in this part of the script -- confirm where sortedfile is consumed.
	sortedfile = os.path.join(countdir, language+'_2013_sorted.txt')
Esempio n. 2
0
File: cli.py Progetto: smeylan/opus
def processLanguage(language, expandpath, outputdir):
	"""Forward the call to ``opus.processLanguage`` with the same arguments.

	The three arguments are passed through positionally and unchanged; the
	result of the underlying call is not returned.

	NOTE(review): the original docstring described this as running text
	extraction over a large number of .gz files -- confirm against
	``opus.processLanguage``.
	"""
	opus.processLanguage(language, expandpath, outputdir)