def dumpDictToPickle(self):
    """Convert the Mueller dictionary text file into a pickled lookup table.

    Reads ``PATH_TO_RES + DICT + 'mueller4.txt'`` decoded as windows-1251.
    Lines starting with ``'_'`` are ignored.  Every other line is split into
    a headword and its article at the first separator, and the entry
    ``{'rus': <article without its first [...] part>, 'eng': <headword>}``
    is stored under the headword in a ``redict``.  The finished table is
    pickled to ``PATH_TO_RES + RESULTING_DICT``.

    Lines that are blank or contain no separator are skipped instead of
    aborting the whole conversion.
    """
    # NOTE(review): the separator literal is kept exactly as found in this
    # method.  If mueller4.txt separates headword and article differently
    # (e.g. with two spaces), adjust it here -- confirm against the data.
    separator = ' '

    results_dict = redict({})

    # codecs.open already yields decoded text, so no extra unicode()
    # conversion is needed; ``with`` closes the file even on errors.
    with codecs.open(PATH_TO_RES + DICT + 'mueller4.txt', 'r',
                     'windows-1251') as source:
        for line in source:
            if line.startswith('_'):
                continue

            # Split on the first separator only: the article itself may
            # contain the separator and must stay in one piece.
            word, found, trans = line.strip().partition(separator)
            if not found:
                continue

            # Drop the bracketed part (from the first '[' to the first ']')
            # only when both brackets exist in that order.  Without this
            # guard a missing '[' makes find() return -1 and the slice can
            # degenerate to ']' and strip every closing bracket.
            start = trans.find('[')
            end = trans.find(']')
            if 0 <= start < end:
                trans = trans.replace(trans[start:end + 1], '')

            results_dict[word] = {
                'rus': trans,
                'eng': word,
            }

    # Pickle streams are byte-oriented, so write in binary mode.
    with open(PATH_TO_RES + RESULTING_DICT, 'wb') as dump:
        pickle.dump(results_dict, dump)
def parseTatoebaExamples(self):
    """Build and pickle an English -> Russian table of example sentences.

    Two tab-separated files are read:

    * ``PATH_TO_RES + LINKS``: column 0 is a sentence id, column 1 the id
      of a sentence linked to it.
    * ``PATH_TO_RES + EXAMPLES``: column 0 is the sentence id, column 1 the
      language code, column 2 the sentence text.

    Only ``'eng'`` and ``'rus'`` sentences that have at least one link are
    kept.  For every English sentence linked to a Russian one, the entry
    ``{'eng': <english>, 'rus': <russian>}`` is stored in a ``redict``
    under the English sentence; a later Russian link overwrites an earlier
    one.  The table is pickled to ``PATH_TO_RES + RESULTING_DICTIONARY``.

    Rows with too few columns are skipped.
    """
    # sentence id -> list of ids it is linked to
    links = {}
    with open(PATH_TO_RES + LINKS) as links_file:
        for row in csv.reader(links_file, delimiter='\t'):
            if len(row) < 2:
                continue
            links.setdefault(row[0], []).append(row[1])

    # sentence id -> language, text and linked ids (eng/rus only)
    examples = {}
    with open(PATH_TO_RES + EXAMPLES) as examples_file:
        for row in csv.reader(examples_file, delimiter='\t'):
            if len(row) < 3 or row[1] not in ('eng', 'rus'):
                continue
            translation_ids = links.get(row[0])
            if translation_ids is None:
                # Sentence has no links, so it can never be paired.
                continue
            examples[row[0]] = {
                'lang': row[1],
                'sentence': row[2],
                'translation': translation_ids,
            }

    # The link table is no longer needed; free it before the next pass.
    del links

    examples_dictionary = redict({})
    for entry in examples.values():
        if entry['lang'] != 'eng':
            continue
        for linked_id in entry['translation']:
            linked = examples.get(linked_id)
            if linked is None or linked['lang'] != 'rus':
                continue
            try:
                examples_dictionary[entry['sentence']] = {
                    'eng': entry['sentence'],
                    'rus': linked['sentence'],
                }
            except Exception:
                # NOTE(review): the original swallowed every error raised
                # while storing a pair.  redict is not visible here, so the
                # store stays best-effort; narrow this once the exceptions
                # redict can raise on assignment are known.
                continue

    del examples

    # Pickle streams are byte-oriented, so write in binary mode.
    with open(PATH_TO_RES + RESULTING_DICTIONARY, 'wb') as dump:
        pickle.dump(examples_dictionary, dump)

    print('well, well, well')
def parseTatoebaExamples(self):
    """Build and pickle an English -> Russian table of example sentences.

    Two tab-separated files are read:

    * ``PATH_TO_RES + LINKS``: column 0 is a sentence id, column 1 the id
      of a sentence linked to it.
    * ``PATH_TO_RES + EXAMPLES``: column 0 is the sentence id, column 1 the
      language code, column 2 the sentence text.

    Only ``'eng'`` and ``'rus'`` sentences that have at least one link are
    kept.  For every English sentence linked to a Russian one, the entry
    ``{'eng': <english>, 'rus': <russian>}`` is stored in a ``redict``
    under the English sentence; a later Russian link overwrites an earlier
    one.  The table is pickled to ``PATH_TO_RES + RESULTING_DICTIONARY``.

    Rows with too few columns are skipped.
    """
    # sentence id -> list of ids it is linked to
    links = {}
    with open(PATH_TO_RES + LINKS) as links_file:
        for row in csv.reader(links_file, delimiter='\t'):
            if len(row) < 2:
                continue
            links.setdefault(row[0], []).append(row[1])

    # sentence id -> language, text and linked ids (eng/rus only)
    examples = {}
    with open(PATH_TO_RES + EXAMPLES) as examples_file:
        for row in csv.reader(examples_file, delimiter='\t'):
            if len(row) < 3 or row[1] not in ('eng', 'rus'):
                continue
            translation_ids = links.get(row[0])
            if translation_ids is None:
                # Sentence has no links, so it can never be paired.
                continue
            examples[row[0]] = {
                'lang': row[1],
                'sentence': row[2],
                'translation': translation_ids,
            }

    # The link table is no longer needed; free it before the next pass.
    del links

    examples_dictionary = redict({})
    for entry in examples.values():
        if entry['lang'] != 'eng':
            continue
        for linked_id in entry['translation']:
            linked = examples.get(linked_id)
            if linked is None or linked['lang'] != 'rus':
                continue
            try:
                examples_dictionary[entry['sentence']] = {
                    'eng': entry['sentence'],
                    'rus': linked['sentence'],
                }
            except Exception:
                # NOTE(review): the original swallowed every error raised
                # while storing a pair.  redict is not visible here, so the
                # store stays best-effort; narrow this once the exceptions
                # redict can raise on assignment are known.
                continue

    del examples

    # Pickle streams are byte-oriented, so write in binary mode.
    with open(PATH_TO_RES + RESULTING_DICTIONARY, 'wb') as dump:
        pickle.dump(examples_dictionary, dump)

    print('well, well, well')
def dumpDictToPickle(self):
    """Convert the Mueller dictionary text file into a pickled lookup table.

    Reads ``PATH_TO_RES + DICT + 'mueller4.txt'`` decoded as windows-1251.
    Lines starting with ``'_'`` are ignored.  Every other line is split into
    a headword and its article at the first separator, and the entry
    ``{'rus': <article without its first [...] part>, 'eng': <headword>}``
    is stored under the headword in a ``redict``.  The finished table is
    pickled to ``PATH_TO_RES + RESULTING_DICT``.

    Lines that are blank or contain no separator are skipped instead of
    aborting the whole conversion.
    """
    # NOTE(review): the separator literal is kept exactly as found in this
    # method.  If mueller4.txt separates headword and article differently
    # (e.g. with two spaces), adjust it here -- confirm against the data.
    separator = ' '

    results_dict = redict({})

    # codecs.open already yields decoded text, so no extra unicode()
    # conversion is needed; ``with`` closes the file even on errors.
    with codecs.open(PATH_TO_RES + DICT + 'mueller4.txt', 'r',
                     'windows-1251') as source:
        for line in source:
            if line.startswith('_'):
                continue

            # Split on the first separator only: the article itself may
            # contain the separator and must stay in one piece.
            word, found, trans = line.strip().partition(separator)
            if not found:
                continue

            # Drop the bracketed part (from the first '[' to the first ']')
            # only when both brackets exist in that order.  Without this
            # guard a missing '[' makes find() return -1 and the slice can
            # degenerate to ']' and strip every closing bracket.
            start = trans.find('[')
            end = trans.find(']')
            if 0 <= start < end:
                trans = trans.replace(trans[start:end + 1], '')

            results_dict[word] = {
                'rus': trans,
                'eng': word,
            }

    # Pickle streams are byte-oriented, so write in binary mode.
    with open(PATH_TO_RES + RESULTING_DICT, 'wb') as dump:
        pickle.dump(results_dict, dump)
def __init__(self):
    """Initialise the instance with an empty ``redict`` as ``self.dictionary``."""
    # redict is provided elsewhere in the project; it is seeded here with
    # an empty plain dict.
    empty_table = redict(dict())
    self.dictionary = empty_table