def read_tsv(argv, file_encoding="latin-1"):
    """Load JSON-encoded key/value rows from a TSV file into a ``patricia``.

    Each line must hold exactly two tab-separated fields: a JSON-encoded key
    and a JSON-encoded sequence of strings.  The sequence is joined with
    commas and stored as the value of the key.  The two raw fields of every
    row are echoed to stderr.

    Args:
        argv: Command-line style argument list; ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Text encoding used to decode the input file.

    Returns:
        A ``(data, allwords)`` tuple: the ``_data`` attribute of the populated
        ``patricia`` instance and the list of every key read, in file order
        and including repeats.

    Raises:
        ValueError: If a line does not split into exactly two fields or a
            field is not valid JSON.
    """
    import codecs

    filename = argv[1] if len(argv) > 1 else "../testdata/norwegian_words.txt"

    tree = patricia()
    allwords = []
    key_value = {}
    # The context manager closes the file even when a row fails to parse;
    # previously the handle stayed open until it was garbage collected.
    with codecs.open(filename, encoding=file_encoding) as handle:
        for line in handle:
            raw_word, raw_value = line.strip().split("\t")
            # Same text the print statement produced: the list's repr + "\n".
            sys.stderr.write("%s\n" % ([raw_word, raw_value],))
            word = json.loads(raw_word.strip())
            value = ",".join(json.loads(raw_value.strip()))
            allwords.append(word)
            if not tree.isWord(word):
                tree.addWord(word)
            # NOTE(review): the source formatting does not show whether this
            # assignment belonged under the `if` above; it is done for every
            # row (last value wins) so each key in allwords has an entry.
            key_value[word] = value

    attached = set()
    for word in allwords:
        # allwords may repeat a key: attach its value once, skip empty keys.
        if word in attached or len(word) == 0:
            continue
        tree.addVal(word, key_value[word])
        attached.add(word)
    return tree._data, allwords
def read_input(argv, file_encoding="latin-1"):
    """Load JSON-encoded keys, one per line, into a ``patricia``.

    Keys containing ``/`` are reported on stderr and skipped.  Every accepted
    non-empty key is stored with itself as its value.

    Args:
        argv: Command-line style argument list; ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Text encoding used to decode the input file.

    Returns:
        A ``(data, allwords)`` tuple: the ``_data`` attribute of the populated
        ``patricia`` instance and the list of keys that were added, in file
        order.

    Raises:
        ValueError: If a line is not valid JSON.
    """
    import codecs

    filename = argv[1] if len(argv) > 1 else "../testdata/norwegian_words.txt"

    tree = patricia()
    allwords = []
    # The context manager closes the file even when a line fails to parse;
    # previously the handle stayed open until it was garbage collected.
    with codecs.open(filename, encoding=file_encoding) as handle:
        for line in handle:
            word = json.loads(line.strip())
            if '/' in word:
                # Same text the print statement produced.
                sys.stderr.write("Skipping word: %s\n" % ([word],))
                continue
            if not tree.isWord(word):
                allwords.append(word)
                tree.addWord(word)

    attached = set()
    for word in allwords:
        # Attach each key's value once and never for an empty key.
        if word in attached or len(word) == 0:
            continue
        tree.addVal(word, word)
        attached.add(word)
    return tree._data, allwords
def read_tsv(argv, file_encoding="latin-1"):
    """Load JSON-encoded key/value rows from a TSV file into a ``patricia``.

    Each line must hold exactly two tab-separated fields: a JSON-encoded key
    and a JSON-encoded sequence of strings.  The sequence is joined with
    commas and stored as the value of the key.

    Args:
        argv: Command-line style argument list; ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Text encoding used to decode the input file.

    Returns:
        A ``(data, allwords)`` tuple: the ``_data`` attribute of the populated
        ``patricia`` instance and the list of every key read, in file order
        and including repeats.

    Raises:
        ValueError: If a line does not split into exactly two fields or a
            field is not valid JSON.
    """
    import codecs

    filename = argv[1] if len(argv) > 1 else "../testdata/norwegian_words.txt"

    tree = patricia()
    allwords = []
    key_value = {}
    # The context manager closes the file even when a row fails to parse;
    # previously the handle stayed open until it was garbage collected.
    with codecs.open(filename, encoding=file_encoding) as handle:
        for line in handle:
            raw_word, raw_value = line.strip().split("\t")
            word = json.loads(raw_word.strip())
            value = ",".join(json.loads(raw_value.strip()))
            allwords.append(word)
            if not tree.isWord(word):
                tree.addWord(word)
            # NOTE(review): the source formatting does not show whether this
            # assignment belonged under the `if` above; it is done for every
            # row (last value wins) so each key in allwords has an entry.
            key_value[word] = value

    attached = set()
    for word in allwords:
        # allwords may repeat a key: attach its value once, skip empty keys.
        if word in attached or len(word) == 0:
            continue
        tree.addVal(word, key_value[word])
        attached.add(word)
    return tree._data, allwords
def read_input(argv, file_encoding="latin-1"):
    """Load JSON-encoded keys, one per line, into a ``patricia``.

    Keys containing ``/`` are reported on stderr and skipped.  Every accepted
    non-empty key is stored with itself as its value.

    Args:
        argv: Command-line style argument list; ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Text encoding used to decode the input file.

    Returns:
        A ``(data, allwords)`` tuple: the ``_data`` attribute of the populated
        ``patricia`` instance and the list of keys that were added, in file
        order.

    Raises:
        ValueError: If a line is not valid JSON.
    """
    import codecs

    filename = argv[1] if len(argv) > 1 else "../testdata/norwegian_words.txt"

    tree = patricia()
    allwords = []
    # The context manager closes the file even when a line fails to parse;
    # previously the handle stayed open until it was garbage collected.
    with codecs.open(filename, encoding=file_encoding) as handle:
        for line in handle:
            word = json.loads(line.strip())
            if '/' in word:
                # Same text the print statement produced.
                sys.stderr.write("Skipping word: %s\n" % ([word],))
                continue
            if not tree.isWord(word):
                allwords.append(word)
                tree.addWord(word)

    attached = set()
    for word in allwords:
        # Attach each key's value once and never for an empty key.
        if word in attached or len(word) == 0:
            continue
        tree.addVal(word, word)
        attached.add(word)
    return tree._data, allwords
def reducer_init(self):
    """Set up the reducer's state: key list, key/value map and a ``patricia``.

    Any exception raised during setup is not propagated; its message is
    recorded through ``self.increment_counter("reducer_init", <message>, 1)``.
    """
    try:
        # Plain containers first, so they exist even if building the
        # ``patricia`` instance below raises.
        self.all_keys = []
        self.keyvalue = {}
        self.patricia_tree = patricia()
    except Exception as e:  # "as" form: valid on Python 2.6+ and Python 3
        # Best effort by design: surface the failure as a counter, not a crash.
        self.increment_counter("reducer_init", str(e), 1)
def read_input(argv, file_encoding="latin-1"):
    """Load JSON-encoded keys, one per line, into a ``patricia``.

    Every distinct non-empty key is stored with its reversed form as value.
    Progress is written to stderr every 10000 iterations of each pass.

    Args:
        argv: Command-line style argument list of exactly three entries:
            program name, key file and key/value file.
        file_encoding: Text encoding used to decode the key file.

    Returns:
        A ``(data, allwords, key_value_filename)`` tuple: the ``_data``
        attribute of the populated ``patricia`` instance, the list of keys
        that were added (file order) and ``argv[2]`` unchanged; the
        key/value file itself is not opened here.

    Raises:
        SystemExit: With status 1, after printing a usage line to stdout,
            when ``argv`` does not have exactly three entries.
        ValueError: If a line of the key file is not valid JSON.
    """
    import codecs

    if len(argv) != 3:
        # The original printed the format string without filling in "%s".
        program = argv[0] if argv else sys.argv[0]
        sys.stdout.write("usage: %s <key_file> <key_value_file>\n" % program)
        sys.exit(1)
    filename, key_value_filename = argv[1], argv[2]

    tree = patricia()
    allwords = []
    # The context manager closes the file even when a line fails to parse;
    # previously the handle stayed open until it was garbage collected.
    with codecs.open(filename, encoding=file_encoding) as handle:
        # NOTE(review): counters are advanced once per iteration; the source
        # formatting does not show whether they sat inside the `if` blocks.
        for i, line in enumerate(handle):
            word = json.loads(line.strip())
            if not tree.isWord(word):
                allwords.append(word)
                tree.addWord(word)
            if i % 10000 == 0:
                # Two spaces reproduce the original print-statement output.
                sys.stderr.write("added key number  %d\n" % i)

    attached = set()
    for j, word in enumerate(allwords):
        # Attach each key's value once and never for an empty key.
        if word not in attached and len(word) > 0:
            tree.addVal(word, word[::-1])
            attached.add(word)
        if j % 10000 == 0:
            sys.stderr.write("added key number  %d\n" % j)
    return tree._data, allwords, key_value_filename
def read_input(argv, file_encoding="latin-1"):
    """Load JSON-encoded keys, one per line, into a ``patricia``.

    Every distinct non-empty key is stored with its reversed form as value.
    Progress is written to stderr every 10000 iterations of each pass.

    Args:
        argv: Command-line style argument list of exactly three entries:
            program name, key file and key/value file.
        file_encoding: Text encoding used to decode the key file.

    Returns:
        A ``(data, allwords, key_value_filename)`` tuple: the ``_data``
        attribute of the populated ``patricia`` instance, the list of keys
        that were added (file order) and ``argv[2]`` unchanged; the
        key/value file itself is not opened here.

    Raises:
        SystemExit: With status 1, after printing a usage line to stdout,
            when ``argv`` does not have exactly three entries.
        ValueError: If a line of the key file is not valid JSON.
    """
    import codecs

    if len(argv) != 3:
        # The original printed the format string without filling in "%s".
        program = argv[0] if argv else sys.argv[0]
        sys.stdout.write("usage: %s <key_file> <key_value_file>\n" % program)
        sys.exit(1)
    filename, key_value_filename = argv[1], argv[2]

    tree = patricia()
    allwords = []
    # The context manager closes the file even when a line fails to parse;
    # previously the handle stayed open until it was garbage collected.
    with codecs.open(filename, encoding=file_encoding) as handle:
        # NOTE(review): counters are advanced once per iteration; the source
        # formatting does not show whether they sat inside the `if` blocks.
        for i, line in enumerate(handle):
            word = json.loads(line.strip())
            if not tree.isWord(word):
                allwords.append(word)
                tree.addWord(word)
            if i % 10000 == 0:
                # Two spaces reproduce the original print-statement output.
                sys.stderr.write("added key number  %d\n" % i)

    attached = set()
    for j, word in enumerate(allwords):
        # Attach each key's value once and never for an empty key.
        if word not in attached and len(word) > 0:
            tree.addVal(word, word[::-1])
            attached.add(word)
        if j % 10000 == 0:
            sys.stderr.write("added key number  %d\n" % j)
    return tree._data, allwords, key_value_filename
def big_test(argv, file_encoding="latin-1"):
    """Load one word per line from a text file into a ``patricia``.

    Each line is stripped of surrounding whitespace and used as a key; every
    distinct non-empty key is stored with its reversed form as value.

    Args:
        argv: Command-line style argument list; ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Text encoding used to decode the input file.

    Returns:
        A ``(data, allwords)`` tuple: the ``_data`` attribute of the populated
        ``patricia`` instance and the list of every stripped line, in file
        order and including repeats and empty strings.
    """
    import codecs

    filename = argv[1] if len(argv) > 1 else "../testdata/norwegian_words.txt"

    tree = patricia()
    allwords = []
    # The context manager closes the file deterministically; previously the
    # handle stayed open until it was garbage collected.
    with codecs.open(filename, encoding=file_encoding) as handle:
        for line in handle:
            word = line.strip()
            allwords.append(word)
            if not tree.isWord(word):
                tree.addWord(word)

    attached = set()
    for word in allwords:
        # allwords may repeat a word: attach its value once, skip blanks.
        if word in attached or not word:
            continue
        tree.addVal(word, word[::-1])
        attached.add(word)
    return tree._data, allwords
def big_test(argv, file_encoding="latin-1"):
    """Load one word per line from a text file into a ``patricia``.

    Each line is stripped of surrounding whitespace and used as a key; every
    distinct non-empty key is stored with its reversed form as value.

    Args:
        argv: Command-line style argument list; ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Text encoding used to decode the input file.

    Returns:
        A ``(data, allwords)`` tuple: the ``_data`` attribute of the populated
        ``patricia`` instance and the list of every stripped line, in file
        order and including repeats and empty strings.
    """
    import codecs

    filename = argv[1] if len(argv) > 1 else "../testdata/norwegian_words.txt"

    tree = patricia()
    allwords = []
    # The context manager closes the file deterministically; previously the
    # handle stayed open until it was garbage collected.
    with codecs.open(filename, encoding=file_encoding) as handle:
        for line in handle:
            word = line.strip()
            allwords.append(word)
            if not tree.isWord(word):
                tree.addWord(word)

    attached = set()
    for word in allwords:
        # allwords may repeat a word: attach its value once, skip blanks.
        if word in attached or not word:
            continue
        tree.addVal(word, word[::-1])
        attached.add(word)
    return tree._data, allwords