def load_data():
    # Read the prediction CSV and keep the raw rows as strings
    with open(predict_file, 'r', encoding='utf-8') as f:
        r = csv.reader(f, delimiter=',', quotechar='"')
        raw_data = np.array(list(r))
    s.print_data("raw", raw_data)
    # Drop the header row and the PO-number column, then remove column 6 of the remaining fields
    str_data = np.delete(raw_data[1:, 1:], [6], axis=1)
    po_nums = raw_data[1:, 0]
    return s.normalize(str_data.astype('float32')), po_nums
def parse(tokens):
    """
    Parse a tokenized query into a simplified, executable postfix list,
    based on the Shunting-yard algorithm.
    :param tokens: tokens of a query, including words and the strings
                   'AND', 'OR', 'NOT', '(' and ')'
    :return: parsed tokens in postfix order
    """
    # Parse
    special_tokens = ['AND', 'OR', 'NOT', '(', ')']
    ops = {'AND': 3, 'OR': 2}  # precedence
    output = []
    op_stack = []
    # Adapted from the pseudocode on the Shunting-yard algorithm Wikipedia page
    for i in range(len(tokens)):
        if tokens[i] not in special_tokens:
            word = normalize(tokens[i])
            output.append(word)
        elif tokens[i] == 'NOT':
            op_stack.append('NOT')
        elif tokens[i] in ops:
            # NOT is not listed in ops; give it the highest effective precedence (4) so a
            # pending NOT is popped before AND/OR is pushed (this also avoids a KeyError).
            while op_stack and op_stack[-1] != '(' and ops.get(op_stack[-1], 4) >= ops[tokens[i]]:
                output.append(op_stack.pop())
            op_stack.append(tokens[i])
        elif tokens[i] == '(':
            op_stack.append('(')
        elif tokens[i] == ')':
            while op_stack and op_stack[-1] != '(':
                output.append(op_stack.pop())
            if op_stack and op_stack[-1] == '(':
                op_stack.pop()
            if op_stack and op_stack[-1] == 'NOT':
                output.append(op_stack.pop())
    while op_stack:
        output.append(op_stack.pop())

    # Simplify
    i = 0
    while i < len(output):
        # Double negation is cancelled
        if output[i] == 'NOT' and output[i - 1] == 'NOT':
            output[(i - 1):] = output[(i + 1):]
        # Use De Morgan's law to push a NOT over an OR: NOT (a OR b) -> (NOT a) AND (NOT b)
        elif output[i] == 'NOT' and output[i - 1] == 'OR':
            first_operand = get_operand(output, i - 1)
            second_operand = get_operand(output, i - 1 - len(first_operand))
            remaining = i + 1
            i = i - 2 - len(first_operand) - len(second_operand)
            output[(i + 1):] = second_operand + ['NOT'] + first_operand + ['NOT', 'AND'] + output[remaining:]
        i = i + 1
    return output
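# A quick illustration of the expected behaviour (a sketch, assuming normalize() simply
# lowercases a term and get_operand() returns the token span of the operand immediately
# to the left of the given position):
#
#   parse(['NOT', '(', 'apple', 'OR', 'banana', ')'])
#   # -> ['apple', 'NOT', 'banana', 'NOT', 'AND']
#   # i.e. NOT (apple OR banana) is rewritten by De Morgan's law into
#   # (NOT apple) AND (NOT banana), expressed in postfix order.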
def resize_save_preview(self, outpath, target_shape=(4000, 4000)):
    h_, w_, *_ = self.shape
    # Scale factor that fits the image inside target_shape while preserving the aspect ratio
    rrate_ = min(target_shape[0] / h_, target_shape[1] / w_)
    target_shape = (np.array([h_, w_], dtype=Config.TYPE_FLOAT) * rrate_).astype(dtype=int)
    # Copy the data into a temporary disk-backed array so the original map is untouched
    tmppath = path.join(Config.TEMP_DIR, str(uuid.uuid4()))
    tmp_map = DiskMap(tmppath, dtype=self.dtype, mode='w+', shape=self.shape)
    tmp_map[:] = self[:]
    resized = imresize(tmp_map, target_shape)
    normalized = (normalize(resized) * 255.).astype(np.uint8)
    imsave(outpath, exposure.rescale_intensity(normalized))
    del normalized
    print('Resized preview saved', outpath, target_shape)
    # Remove the temporary backing file, not the in-memory resized copy
    DiskMap.remove_file(tmp_map)
def bigram_inverted_index(collection):
    bigram_index = defaultdict(set)
    bigram_iindex = defaultdict(set)
    for doc in collection:
        # Preprocess without lemmatization
        new_text = normalize(doc.full_text())
        tokens = tokenize(new_text)
        text_tokens = remove_stop_word(tokens)
        for word in set(text_tokens):
            if word in bigram_index:
                continue
            bigrams = get_bigrams(word)
            bigram_index[word] = set(bigrams)
            for bigram in bigrams:
                bigram_iindex[bigram].add(word)
    return bigram_index, bigram_iindex
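# get_bigrams() is referenced above but not defined in this excerpt. A minimal sketch of
# what it is assumed to do, following the common convention of character bigrams over a
# '$'-padded word (the project's actual helper may differ):
def get_bigrams_sketch(word):
    """Return the character bigrams of `word`, padded with '$' boundary markers."""
    padded = '$' + word + '$'
    # e.g. 'cat' -> ['$c', 'ca', 'at', 't$']
    return [padded[i:i + 2] for i in range(len(padded) - 1)]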
def load_process_data_b():
    with open(pos_file, 'r', encoding='utf-8') as f1, open(neg_file, 'r', encoding='utf-8') as f2:
        r1 = csv.reader(f1, delimiter=',', quotechar='"')
        r2 = csv.reader(f2, delimiter=',', quotechar='"')
        pos_data = process_b(np.array(list(r1)))
        neg_data = process_b(np.array(list(r2)))
    data = np.concatenate((pos_data, neg_data), axis=0)
    np.random.shuffle(data)
    # Column 5 is 'violation', i.e. the training target y_data, so drop it from the features
    x_data = s.normalize(np.delete(data, [5], axis=1))
    y_data = data[:, 5]
    # Trim the row count to a multiple of const_folds so the folds divide evenly (e.g. 20156 -> 20150).
    # An earlier version trimmed to a multiple of 1280, which rounds small datasets down to 0.
    len_data = data.shape[0] // const_folds * const_folds
    return x_data[:len_data], y_data[:len_data]
def load_process_data_m():
    with open(neg_file, 'r', encoding='utf-8') as f1:
        r1 = csv.reader(f1, delimiter=',', quotechar='"')
        data_list = np.array(list(r1))
    data = process_m(data_list)
    # Trim the row count to a multiple of const_folds so the folds divide evenly (e.g. 20156 -> 20150)
    len_data = data.shape[0] // const_folds * const_folds
    # Column 5 is 'violation', i.e. the target y_data, so drop it from the features
    x_data = s.normalize(np.delete(data, [5], axis=1))
    y_data = data[:, 5]  # take only column 5 of the 2-D array as the labels
    return x_data[:len_data], y_data[:len_data]
def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file.
    """
    print("indexing...")
    # Initializations
    dictionary = {}
    doc_freq = {}
    # Iterate through each file.
    # Not directly using sorted(files) because that sorts
    # alphabetically instead of by index number.
    files = os.listdir(in_dir)
    file_index = 0
    block_count = 0
    file_count = 0
    while file_index < max_doc_id:
        # For each file, tokenize and normalize the words
        file_index = file_index + 1
        if not str(file_index) in files:
            continue
        file_name = in_dir + "/" + str(file_index)
        # Get the unprocessed words
        with open(file_name, 'r') as reader:
            content = reader.read()
        file_count += 1
        words_in_doc = []
        words = nltk.word_tokenize(content)
        for w in words:
            ws = w.split('/')
            for word in ws:
                word = normalize(word)
                if not word == "" and word not in words_in_doc:
                    words_in_doc.append(word)
        for word in words_in_doc:
            if word not in dictionary:
                dictionary[word] = []
                doc_freq[word] = 0
            dictionary[word].append(file_index)
            doc_freq[word] += 1
        # Divide the files into blocks, each block with block_size files;
        # index every block by applying BSBI
        if file_count >= block_size:
            block_count += 1
            temp_dict_path = "./temp_dict" + str(block_count) + ".txt"
            temp_posting_path = "./temp_post" + str(block_count) + ".txt"
            bsbi_invert(dictionary, doc_freq, temp_dict_path, temp_posting_path)
            dictionary = {}
            doc_freq = {}
            file_count = 0
    if len(dictionary) >= 1:
        # Construct the last (partial) block using BSBI
        block_count += 1
        temp_dict_path = "./temp_dict" + str(block_count) + ".txt"
        temp_posting_path = "./temp_post" + str(block_count) + ".txt"
        bsbi_invert(dictionary, doc_freq, temp_dict_path, temp_posting_path)
    merge_block(block_count, out_dict, out_postings)
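# Hypothetical usage (the paths below are placeholders, not taken from the source): with
# the module-level settings max_doc_id and block_size defined, and the corpus files named
# by their numeric document IDs inside in_dir, the index would be built with something like:
#
#   build_index('./corpus', 'dictionary.txt', 'postings.txt')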
def slice_normalize(self, roi):
    # Reorder the ROI so that (x0, y0) is the top-left and (x1, y1) the bottom-right corner
    roi = [min(roi[0], roi[2]), min(roi[1], roi[3]),
           max(roi[0], roi[2]), max(roi[1], roi[3])]
    w_ = roi[2] - roi[0]
    h_ = roi[3] - roi[1]
    sliced = np.array(self[roi[1]:roi[1] + h_, roi[0]:roi[0] + w_]).astype(dtype=Config.TYPE_FLOAT)
    return (normalize(sliced) * 255.).astype(np.uint8)
def copy_normalize(self, signed=False):
    tmppath = path.join(Config.TEMP_DIR, str(uuid.uuid4()))
    copy = DiskMap(tmppath, dtype=Config.TYPE_FLOAT, mode='w+', shape=self.shape)
    copy[:] = self[:]
    normalize(copy, signed)
    return copy