def get(self): self.values["project"] = "http://www.proven-corporation.com/software/app-engine-console/" if self.values["subpage"] == "usage": for exampleNum in range(len(self.examples)): key = "example%d" % (exampleNum + 1) val = util.trim(self.examples[exampleNum]) val = pygments.highlight(val, self.resultLexer, self.outputFormatter).strip() self.values[key] = val elif self.values["subpage"] == "integration": self.values["example1"] = pygments.highlight( util.trim( """ def is_dev(): import os return os.environ['SERVER_SOFTWARE'].startswith('Dev') """ ), self.pythonLexer, self.outputFormatter, ).strip() self.values["example2"] = pygments.highlight( util.trim( """ >>> is_dev() True """ ), self.resultLexer, self.outputFormatter, ).strip()
def compute_similarity(path1, path2):
    """
    Compute the similarity between two wav audio files

    Keyword arguments:
    path1 -- the path of the first audio file
    path2 -- the path of the second audio file

    Return: a tuple containing the similarity score, the name of the first file
    and the name of the second file
    """
    # Read the audio files
    fs1, data1 = wavfile.read(path1)
    fs2, data2 = wavfile.read(path2)
    if fs1 != fs2:
        print('Files do not have the same sample frequency')
        return 0.0

    # Align and sign the tracks
    track1 = trim(data1)
    track2 = trim(data2)
    signed1 = sign_track(track1, fs1)
    signed2 = sign_track(track2, fs2)

    # Sanitize the filenames
    filename1 = path1.split('/')[-1].split('.')[0]
    filename2 = path2.split('/')[-1].split('.')[0]

    return (signatures_similarity(signed1, signed2), filename1, filename2)
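The helpers trim, sign_track, and signatures_similarity belong to the surrounding project and are not shown here. As a rough, hypothetical sketch only (the threshold value and the exact behaviour are assumptions, not the project's actual implementation), a silence-trimming trim for a wav array could look like this:

import numpy as np

def trim(data, threshold=200):
    """Hypothetical sketch: drop leading/trailing samples whose amplitude
    stays below `threshold`. The real project's trim() may differ."""
    amplitude = np.abs(data.astype(np.int64))
    loud = np.where(amplitude > threshold)[0]
    if loud.size == 0:
        return data  # nothing above the threshold; return unchanged
    return data[loud[0]:loud[-1] + 1]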
def format_enum_definition(enum_indent, base_indent, enum_definition):
    formatted_definition = ''
    enum_prefix = trim(enum_definition.split(open_brace)[0])
    formatted_definition += enum_indent + enum_prefix + '\n'
    formatted_definition += enum_indent + open_brace
    enum_definition = trim(trim(trim(enum_definition)[len(enum_prefix):])[1:])
    enum_definition = enum_definition.replace('};', '')
    entries = enum_definition.split(comma)
    for entry in entries:
        if entry != entries[0]:
            formatted_definition += comma
        formatted_definition += '\n' + enum_indent + base_indent + trim(entry)
    formatted_definition += '\n' + enum_indent + '};\n'
    return formatted_definition
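The module-level names open_brace, comma, and trim are defined elsewhere in that project; assuming they are '{', ',', and a whitespace-stripping trim, the formatter expands a one-line C-style enum onto one entry per line. An illustrative call under those assumptions:

open_brace = '{'
comma = ','
trim = str.strip  # assumption: trim() strips surrounding whitespace

print(format_enum_definition('    ', '    ', 'enum Color { RED, GREEN, BLUE };'))
# Expected output:
#     enum Color
#     {
#         RED,
#         GREEN,
#         BLUE
#     };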
def predict(self, word_int_arrs, mode='score'):
    """
    Predict probability/label of aggression/loss of batch of word int arrs.
    """
    sentences = [trim(x) for x in word_int_arrs]
    features = []
    for sentence in sentences:
        representation = np.zeros((40000, ))
        for word_id in sentence:
            representation[word_id] += 1
        features.append(representation)
    sparse_features = sparse.csr_matrix(features)

    aggression_pred_scores = self.classifiers[0].predict_proba(sparse_features)[:, 1]
    loss_pred_scores = self.classifiers[1].predict_proba(sparse_features)[:, 1]

    assert mode in ['score', 'binary']
    if mode == 'score':
        return aggression_pred_scores, loss_pred_scores
    else:
        aggression_pred_labels = [
            1 if x >= self.thresholds[0] else 0 for x in aggression_pred_scores
        ]
        loss_pred_labels = [
            1 if x >= self.thresholds[1] else 0 for x in loss_pred_scores
        ]
        return aggression_pred_labels, loss_pred_labels
def get(self, request, task_name=None, format=None):
    # task_name = filter(None, request._request.path.split('/'))[-1]
    docstring = trim(task_docstring(task_name))
    curl_url = reverse('run-main', kwargs={'task_name': task_name}, request=request)
    # reverse("%s-run" % (task_name), request=request)
    data = {'task_name': task_name,
            'task_docstring': docstring,
            'task_url': curl_url,
            'queue': 'celery'}
    return Response(data)
def print_databases(self):
    """Show all databases"""
    f = self.file
    head = util.read_meta(f, "hDatabases")
    db = Database(f, "_default")
    f.seek(head)
    db.addr = f.tell()
    bdata = f.read(52)
    db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", bdata)
    db.name = util.trim(db.name)
    print db
    while db.next != 0:
        f.seek(db.next)
        db.addr = f.tell()
        bdata = f.read(52)
        db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", bdata)
        db.name = util.trim(db.name)
        print db
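The fixed 52-byte read matches the packed record layout used by these database snippets: a 4-byte unsigned id, a 32-byte name field, and two 8-byte prev/next offsets. A quick standalone check of that arithmetic (not part of the original module):

import struct

# 4 (I) + 32 (32s) + 8 (Q) + 8 (Q) = 52 bytes with "=" (standard sizes, no padding)
assert struct.calcsize("=I32sQQ") == 52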
def get(self, request, task_name=None, format=None):
    # task_name = filter(None, request._request.path.split('/'))[-1]
    docstring = trim(task_docstring(task_name))
    curl_url = reverse('run-main', kwargs={'task_name': task_name}, request=request)
    # reverse("%s-run" % (task_name), request=request)
    username = self.get_username(request)
    if username != "guest":
        token = Token.objects.get_or_create(user=self.request.user)
        auth_token = str(token[0])
    else:
        auth_token = "< authorized-token > "
    data = {'task_name': task_name,
            'task_docstring': docstring,
            'task_url': curl_url,
            'queue': 'celery',
            'auth_token': auth_token}
    return Response(data)
def pass_message(self):
    self._message_subject = util.trim(self._message['subject'].lower())
    self._from = self._message['From']
    self._date = self._message['Date']
    self._body = self._message.get_payload(decode=True)
    try:
        (self._build, self._name) = util.get_branch_gitname(self._message_subject)
    except:
        self._valid = False
    else:
        self._valid = True
def find_last(self):
    """ Returns last database """
    f = self.file
    head = util.read_meta(f, "tDatabases")
    db = Database(f)
    f.seek(head)
    db.addr = f.tell()
    bdata = f.read(52)
    db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", bdata)
    db.name = util.trim(db.name)
    return db
def exists(self):
    """Checks for database existence"""
    f = self.file
    head = util.read_meta(f, "hDatabases")
    db = Database(f, "_default")
    f.seek(head)
    db.addr = f.tell()
    bdata = f.read(52)
    db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", bdata)
    db.name = util.trim(db.name)
    while db.next != 0:
        f.seek(db.next)
        db.addr = f.tell()
        bdata = f.read(52)
        db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", bdata)
        db.name = util.trim(db.name)
        if db.name == self.name:
            return True
    return False
def create_vectorized_representation(self, tweet_data):
    """
    Return the count vectorized and tf-transformed representation of the input tweets.
    """
    sentences = [trim(tweet['word_padded_int_arr']) for tweet in tweet_data]
    features = []
    for sentence in sentences:
        representation = np.zeros((40000, ))
        for word_id in sentence:
            representation[word_id] += 1
        features.append(representation)
    features = sparse.csr_matrix(features)
    return features
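Like predict() above, this method materializes a dense 40000-wide row per tweet before converting to CSR. If memory ever becomes a concern, the same bag-of-words counts can be assembled directly in sparse form; the sketch below is an illustrative alternative (count_vectorize and vocab_size are names introduced here, under the assumption that trim() strips the zero padding):

import numpy as np
from scipy import sparse

def count_vectorize(sentences, vocab_size=40000):
    """Build the CSR count matrix without allocating dense rows first."""
    rows, cols, vals = [], [], []
    for row_idx, sentence in enumerate(sentences):
        ids, counts = np.unique(sentence, return_counts=True)
        rows.extend([row_idx] * len(ids))
        cols.extend(ids.tolist())
        vals.extend(counts.tolist())
    return sparse.csr_matrix((vals, (rows, cols)),
                             shape=(len(sentences), vocab_size))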
def create_adversarial_ELMo_representation(domain_specific,
                                           input_file,
                                           output_dir,
                                           parameter_dir=None):
    generated_sentences = pkl.load(open(input_file, 'rb'))
    revised_int_arrs = generated_sentences['generated_int_arr']
    tweet_ids = generated_sentences['tweet_id']
    tweet_dict = {}
    for idx in range(len(tweet_ids)):
        tweet_dict[tweet_ids[idx]] = {}
        tweet_dict[tweet_ids[idx]]['word_int_arr'] = trim(revised_int_arrs[idx])
    create_ELMo_representation(tweet_dict,
                               domain_specific=domain_specific,
                               output_dir=output_dir,
                               parameter_dir=parameter_dir)
def get_screen_captures(window, num_frames, from_top, from_bottom):
    screen_width = GetSystemMetrics(SM_CXSCREEN)
    screen_height = GetSystemMetrics(SM_CYSCREEN)
    box = (0, from_top, screen_width, screen_height - from_bottom)

    util.simple_mouse(MOUSEEVENTF_MOVE | MOUSEEVENTF_ABSOLUTE,
                      screen_width / 2, screen_height / 2)
    util.simple_mouse(MOUSEEVENTF_LEFTDOWN, 0, 0)
    util.simple_mouse(MOUSEEVENTF_LEFTUP, 0, 0)
    time.sleep(0.1)
    SendKeys.SendKeys("{HOME}")

    for i in range(0, num_frames):
        img = ImageGrab.grab(box)
        img = util.trim(img)
        img.save('images/image%d.png' % i)
        SendKeys.SendKeys("{DOWN}")
def create_ELMo_representation(tweet_dicts,
                               domain_specific,
                               output_dir,
                               masked_unigram_id=None,
                               parameter_dir=None):
    """
    Create ELMo representation for all labeled tweets with a certain unigram masked as UNK.
    The ELMo representations will be stored as .npy files for each tweet.
    """
    if domain_specific:
        args = {}
        args['experiment_path'] = parameter_dir
        path = args['experiment_path']
        args = pkl.load(open('../experiments/%s.param' % path, 'rb'))
        args['experiment_path'] = path
        bilm = create_bilm_from_args(args)
    else:
        words = pkl.load(open('../model/word.pkl', 'rb'))
        id2word = dict([(words[w]['id'], w.decode()) for w in words])

    data_dir = '../data/' + output_dir
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    for tweet_id in tweet_dicts:
        word_int_arr = trim(tweet_dicts[tweet_id]['word_int_arr'][:50])

        # mask the specified unigram as UNK if masked_unigram_id is specified
        if masked_unigram_id is not None:
            for word_idx in range(len(word_int_arr)):
                if word_int_arr[word_idx] == masked_unigram_id:
                    word_int_arr[word_idx] = 1

        # use DS ELMo/NonDS ELMo code to generate corresponding elmo representation
        if domain_specific:
            elmo_rep = np.array([
                x.detach().cpu().numpy()[0]
                for x in bilm.create_rep([word_int_arr])
            ])
        else:
            sentence = [id2word[idx] for idx in word_int_arr]
            elmo_rep = embed_one_sentence(sentence)

        np.save("%s%d.npy" % (data_dir, tweet_id), elmo_rep)
def extract_matrix_from_image(image, x_metrics, y_metrics):
    x_coords, y_coords = util.get_cell_boundaries(image)

    # Now extract the images of the cells (between the lines), based on the coordinates we just
    # calculated. When we have these extracted images, sum the pixel counts along the horizontal
    # and vertical axes. This will be used to compare with training data to identify the digits
    # in the cells
    pyplot.rcParams['figure.dpi'] = 50
    bw_threshold = 160

    # Make the image monochrome. THRESH_BINARY_INV: pixels above the threshold (background)
    # become 0, darker pixels (ink) become 1.
    (thresh, monochrome_image) = cv2.threshold(image, bw_threshold, 1, cv2.THRESH_BINARY_INV)

    puzzle_matrix = []
    for y_coord in y_coords:
        new_matrix_row = []
        for x_coord in x_coords:
            raw_image = monochrome_image[y_coord[0]:y_coord[1], x_coord[0]:x_coord[1]]
            image_height, image_width = raw_image.shape
            image_sum = raw_image.sum()
            image_density = image_sum / (image_width * image_height)

            # If the image density (% of black pixels in the image) is less than a certain threshold
            # we assume the cell is empty and return 0. This is not a test for 0 % since there can be
            # noise in the image. If above the threshold, then determine the number from training data
            if image_density < 0.001:
                number = 0
            else:
                # show_image(raw_image,
                #            title="Y - %s:%s. X - %s:%s" % (y_coord[0], y_coord[1], x_coord[0], x_coord[1]))
                cell_image = util.trim(raw_image)
                if cell_image is None:
                    number = 0
                else:
                    number = get_number_from_image(cell_image, x_metrics, y_metrics)
            print("Number: %s" % number)
            new_matrix_row.append(number)
        puzzle_matrix.append(new_matrix_row)
        print("\n")
    return puzzle_matrix
def make_junto(filename, title, folder='.', threshold=0, ret=False):
    if '_spread' not in filename:
        print 'Spreading file...\n'
        filename = util.spread(filename)
    print 'Making graph...'
    g = graph.make_graph(filename)
    if threshold:
        print '\nTrimming...'
        g['procedures'] = util.trim(g['procedures'], threshold)
    print '\nConstructing junto files...'
    graph.to_junto(g, title)
    with open(title + '_config', 'w') as f:
        f.write(config % {'title': title, 'folder': folder})
    print '\n...Done'
    if ret:
        return g
def compute_all_similarities(folder):
    """
    Compute the similarity between all pairs of wav audio files in the folder

    Keyword arguments:
    folder -- the folder where the audio files are located

    Return: a sorted list of tuples containing the similarity score, the name
    of the first file and the name of the second file
    """
    signatures = {}
    similarities = []
    reference_fs, _ = wavfile.read(folder + '/' + os.listdir(folder)[0])

    # Sign all the audio files in the folder
    for filename in os.listdir(folder):
        fs, data = wavfile.read(folder + '/' + filename)
        if fs != reference_fs:
            print(filename, 'does not have the same sample frequency')
        else:
            track = trim(data)
            signature = sign_track(track, fs)
            signatures[filename] = signature

    # Compute the similarity score between each pair of signed tracks
    for idx, key1 in enumerate(list(signatures.keys())):
        signed1 = signatures[key1]
        for key2 in list(signatures.keys())[idx + 1:]:
            signed2 = signatures[key2]
            similarity = signatures_similarity(signed1, signed2)
            similarities.append((similarity, key1.split('.')[0], key2.split('.')[0]))

    similarities.sort(reverse=True)
    return similarities
def create_natural_sentences(self, mode, token, tweet_dicts):
    assert mode in ['insert', 'replace']
    token_id = self.dl.token2property[token.encode("utf-8")]['id']
    sentence_outputs = {}
    keys = [
        'original_sentence', 'generated_sentence', 'original_prob',
        'generated_prob', 'original_int_arr', 'generated_int_arr', 'tweet_id'
    ]
    for key in keys:
        sentence_outputs[key] = []

    for tweet_id in tweet_dicts.keys():
        sentence = tweet_dicts[tweet_id]['word_padded_int_arr']
        num_words = sum([x != 0 for x in sentence])
        if mode == 'insert':
            if num_words == 50:  # already max length, cannot add more words
                continue
            idx_range = range(num_words + 1)
        else:
            idx_range = range(num_words)

        sentence_outputs['original_int_arr'].append(np.array(sentence))
        original_sentence_unicode = self.dl.convert2unicode(trim(sentence))
        sentence_outputs['original_sentence'].append(original_sentence_unicode)
        original_sentence_prob = self.compute_log_prob([trim(sentence)])
        sentence_outputs['original_prob'].append(original_sentence_prob)
        sentence_outputs['tweet_id'].append(tweet_id)

        max_generated_prob = -np.inf
        most_natural_generated_sentence = None
        for pos in idx_range:
            if mode == 'insert':
                generated_sentence = insert_element(sentence, pos, token_id)
            else:
                generated_sentence = np.array(sentence)
                generated_sentence[pos] = token_id
            new_sentence_prob = self.compute_log_prob([trim(generated_sentence)])
            if new_sentence_prob > max_generated_prob:
                max_generated_prob = new_sentence_prob
                most_natural_generated_sentence = generated_sentence

        most_natural_revised_sentence_unicode = self.dl.convert2unicode(
            trim(most_natural_generated_sentence))
        sentence_outputs['generated_sentence'].append(most_natural_revised_sentence_unicode)
        sentence_outputs['generated_prob'].append(max_generated_prob)
        sentence_outputs['generated_int_arr'].append(np.array(most_natural_generated_sentence))

        if len(sentence_outputs['generated_int_arr']) % 100 == 0:
            print(len(sentence_outputs['generated_int_arr']))
            pkl.dump(sentence_outputs,
                     open("../adversarial_data/%s_%s_natural_sentence_%s.pkl"
                          % (mode, token, self.dataset), 'wb'))

    # order the records in order of maximum probability increase to minimum probability increase
    prob_diff = np.array(sentence_outputs['generated_prob']) - np.array(
        sentence_outputs['original_prob'])
    sorted_idx = np.argsort(prob_diff)[::-1]
    for key in sentence_outputs.keys():
        sentence_outputs[key] = [sentence_outputs[key][idx] for idx in sorted_idx]
    sentence_outputs['prob_change'] = np.array(
        sentence_outputs['generated_prob']) - np.array(sentence_outputs['original_prob'])

    pd.DataFrame.from_dict(sentence_outputs).to_csv(
        "../showable/%s_%s_natural_sentence_%s.csv" % (mode, token, self.dataset),
        index=False)
    pkl.dump(sentence_outputs,
             open("../adversarial_data/%s_%s_natural_sentence_%s.pkl"
                  % (mode, token, self.dataset), 'wb'))
    registry.merge_developers()

    # Link developers
    igdb.link_developers()
    steam.link_developers()

    # Merge articles
    registry.merge_articles()

    # Merge videos
    registry.merge_videos()

    # Merge tweets
    registry.merge_tweets()

    trim()
    WS.flush()
    print("[MAIN] Rebuild completed in %d seconds" % (time() - t))
elif action == '2':
    WS.flush()
elif action == '3':
    registry.merge_games()
elif action == '4':
    registry.merge_developers()
elif action == '5':
    registry.merge_articles()
elif action == '6':
    registry.merge_videos()
elif action == '7':
def lime(self):
    tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length = \
        self.create_perturbation_samples(self.tweet_records)
    (idx2max_min_wordidx, first_round_unigram_ranking,
     first_round_max_influences, first_round_all_influences) = \
        self.analyze_perturbations_influence(tweet_idx_word_idx2idx,
                                             perturbed_tests,
                                             idx2sent_length,
                                             round=1,
                                             observe_word_ids=self.unigram_observe_ids)

    self.masked_tweet_records = {}
    for key in self.tweet_records.keys():
        if key != 'word_content_input_elmo' or self.pad_elmo is True:
            self.masked_tweet_records[key] = np.array(self.tweet_records[key])
        else:
            self.masked_tweet_records[key] = self.tweet_records[key]

    for idx in range(len(idx2max_min_wordidx)):
        self.masked_tweet_records['word_content_input'][idx][
            idx2max_min_wordidx[idx][2]] = 1  # mask insignificant unigram

    if self.input_format == 'discrete':
        tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length = \
            self.create_perturbation_samples(self.masked_tweet_records)
    else:
        elmo_masked_wordidx = [
            idx2max_min_wordidx[idx][2] for idx in range(len(idx2max_min_wordidx))
        ]
        tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length = \
            self.create_perturbation_samples(self.masked_tweet_records,
                                             elmo_masked_idx=elmo_masked_wordidx)

    observe_word_idx = {}
    for idx in range(len(idx2max_min_wordidx)):
        observe_word_idx[idx] = idx2max_min_wordidx[idx][1]

    (second_round_idx2max_min_wordidx, second_round_ranking,
     second_round_max_influences, second_round_all_influences) = \
        self.analyze_perturbations_influence(tweet_idx_word_idx2idx,
                                             perturbed_tests,
                                             idx2sent_length,
                                             round=2,
                                             observe_word_position_idx=observe_word_idx)

    data = {}
    data['original tweet'] = [
        self.dl.convert2unicode(trim(arr))
        for arr in self.tweet_records['word_content_input']
    ]
    data['masked tweet'] = [
        self.dl.convert2unicode(trim(arr))
        for arr in self.masked_tweet_records['word_content_input']
    ]
    data['first round influences'] = first_round_all_influences
    data['first round max influential unigram'] = [
        self.dl.convert2unicode(
            [self.tweet_records['word_content_input'][idx][idx2max_min_wordidx[idx][1]]])
        for idx in range(len(idx2sent_length))
    ]
    data['first round most insignificant unigram'] = [
        self.dl.convert2unicode(
            [self.tweet_records['word_content_input'][idx][idx2max_min_wordidx[idx][2]]])
        for idx in range(len(idx2sent_length))
    ]
    data['first round max influence'] = first_round_max_influences
    data['second round influences'] = second_round_all_influences
    data['second round most influential unigram'] = [
        self.dl.convert2unicode(
            [self.tweet_records['word_content_input'][idx][second_round_idx2max_min_wordidx[idx][1]]])
        for idx in range(len(idx2sent_length))
    ]
    data['second round max influence'] = second_round_max_influences
    data['first round max influential unigram ranking in second round'] = second_round_ranking
    if self.unigram_observe_ids is not None:
        for unigram_id in self.unigram_observe_ids:
            data['first round unigram %s ranking' % id2word[unigram_id]] = \
                first_round_unigram_ranking[unigram_id]

    pd.DataFrame.from_dict(data).to_csv(self.output_dir, index=False)

    second_round_rank_stats = defaultdict(int)
    for num in second_round_ranking:
        second_round_rank_stats[num] += 1

    # first_round_unigram_ranking uses -1 to indicate that the specified unigram is not in the tweet;
    # filter out these -1 rankings to get rankings for only those tweets that include the specified unigram
    first_round_unigram_ranking_included = {}
    for unigram_id in self.unigram_observe_ids:
        first_round_unigram_ranking_included[unigram_id] = []
        for i in first_round_unigram_ranking[unigram_id]:
            if i != -1:
                first_round_unigram_ranking_included[unigram_id].append(i)

    first_round_rank_stats = {}
    for unigram_id in self.unigram_observe_ids:
        stats = defaultdict(int)
        for num in first_round_unigram_ranking_included[unigram_id]:
            stats[num] += 1
        first_round_rank_stats[unigram_id] = stats

    return {
        'unigram_rank_stats': first_round_rank_stats,
        'lime_consistency_stats': second_round_rank_stats,
        'first_round_all_influences': first_round_all_influences,
        'correspondence': self.tweet_id_considered
    }
def forward(self, X):
    if self.input_format == 'discrete':
        sentence = Variable(torch.LongTensor(trim(X['word_content_input'])))
        embedded_val = self.dropout_emb(self.wemb(sentence[:, None]))
    else:
        # permute axis order for convenience of model forward
        # Elmo reorder of dimension axis is performed by the model itself.
        elmo_rep_permuted = np.swapaxes(
            np.swapaxes(X['word_content_input_elmo'], 0, 2), 0, 1)
        if self.input_format == "elmo":
            self.normalized_weight = F.softmax(self.kernel_weight, dim=0)
            if self.elmo_option == 'add_average':
                self.second_normalized_weight = F.softmax(self.second_kernel_weight, dim=0)
            sentence = Variable(torch.FloatTensor(elmo_rep_permuted))
            average_repr = torch.squeeze(sentence @ self.normalized_weight, dim=2)
            if self.elmo_option == 'add_average':
                all_repr = torch.cat((sentence, average_repr.unsqueeze(dim=2)), dim=2)
                average_repr = torch.squeeze(all_repr @ self.second_normalized_weight, dim=2)
            average_repr = torch.unsqueeze(average_repr, dim=1)
            embedded_val = self.dropout_elmo(average_repr)
        elif self.input_format == 'both':
            self.normalized_weight = F.softmax(self.kernel_weight, dim=0)
            if self.elmo_option == 'add_average':
                self.second_normalized_weight = F.softmax(self.second_kernel_weight, dim=0)
            # discrete input format features
            sentence = Variable(torch.LongTensor(trim(X['word_content_input'])))
            discrete_embedded_val = self.dropout_emb(self.wemb(sentence[:, None])).permute(0, 2, 1)
            # elmo input format features
            sentence = Variable(torch.FloatTensor(elmo_rep_permuted))
            sentence = torch.cat((discrete_embedded_val, sentence), dim=2)
            average_repr = torch.squeeze(sentence @ self.normalized_weight, dim=2)
            if self.elmo_option == 'add_average':
                all_repr = torch.cat((sentence, average_repr.unsqueeze(dim=2)), dim=2)
                average_repr = torch.squeeze(all_repr @ self.second_normalized_weight, dim=2)
            average_repr = torch.unsqueeze(average_repr, dim=1)
            embedded_val = self.dropout_elmo(average_repr)

    hs, (q, _) = self.encoder(embedded_val.float())
    if self.use_attn:
        hs = self.dropout_lstm(hs)
    else:
        q = self.dropout_lstm(q)
    hs = hs.squeeze_(dim=1)

    # get concatenated representation of hidden states with/without context features
    concatenated_representation = []
    for hidden_state in hs:
        if self.context_feature_before:
            for feature_name in self.context_feature_names:
                hidden_state = torch.cat((hidden_state, torch.Tensor(X[feature_name])))
            concatenated_representation.append(hidden_state)
        else:
            concatenated_representation.append(hidden_state)
    concatenated_representation = torch.stack(concatenated_representation)
    if self.verbose:
        print("concatenated hidden states' shape: " + str(concatenated_representation.shape))

    # get concatenated representation of last state of LSTM with/without context features
    concatenated_q = q.view(1, -1)[0]
    if self.context_feature_before:
        for feature_name in self.context_feature_names:
            concatenated_q = torch.cat((concatenated_q, torch.Tensor(X[feature_name])))
    if self.verbose:
        print("concatenated last state's shape: " + str(concatenated_q.shape))

    # calculating attention
    if not self.use_attn:
        z = concatenated_q
    else:
        A = F.softmax(torch.tanh(self.W_a(concatenated_representation)), dim=0)
        z = (torch.transpose(A, 0, 1) @ concatenated_representation)[0]
    if self.verbose:
        print("feature representation shape: " + str(z.shape))

    if self.context_feature_last:
        for feature_name in self.context_feature_names:
            z = torch.cat((z, torch.Tensor(X[feature_name])))
    if self.verbose:
        print("last tensor representation shape: " + str(z.shape))

    output = torch.sigmoid(self.fc(z)).view(-1, )

    return_dict = {'output': output}
    if self.use_attn:
        return_dict['attn'] = A.view(1, -1)
    return return_dict
__author__ = 'jens'

import util

# Reads multi-line input and copies all unique lines to the clipboard.
# This will also trim all leading and trailing spaces.
multiLineInput = util.raw_multi_line_input()
uniqueSorted = util.trim(
    sorted(util.create_set(multiLineInput, lambda s: s.lower()), key=str.lower))
util.copy_to_clipboard(uniqueSorted)
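The util helpers here are project-specific and not shown. Purely as a hypothetical sketch of the behaviour implied by the comment (case-insensitive deduplication plus per-line whitespace trimming), the core of create_set and this flavour of trim might look like:

def create_set(lines, key=lambda s: s):
    """Hypothetical sketch: keep the first occurrence of each line, compared by key()."""
    seen, unique = set(), []
    for line in lines:
        k = key(line)
        if k not in seen:
            seen.add(k)
            unique.append(line)
    return unique

def trim(lines):
    """Hypothetical sketch: strip leading/trailing whitespace from every line."""
    return [line.strip() for line in lines]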
def main():
    if len(sys.argv) < 3:
        raise Exception(
            "Input parameter 1: folder of folders of training data images. "
            "Input parameter 2: folder for JSON output representing image metrics.")

    # input folder of images organized into folders named by the number in the image
    input_folder = sys.argv[1]
    # output folder for json metrics
    output_folder = sys.argv[2]

    image_data = {}
    for i in range(1, 10):
        image_data[i] = {}
        image_data[i]['raw_data'] = []
        image_data[i]['normalized_data'] = []

    # These are to calculate average image width and height
    number_of_items = 0
    width_total = 0
    height_total = 0

    for image_folder_name in listdir(input_folder):
        image_folder = "%s/%s" % (input_folder, image_folder_name)
        if os.path.isdir(image_folder):
            print("Processing: %s..." % image_folder_name)
            # current_number is the number in the images in this folder
            current_number = int(image_folder_name)
            for image_file in listdir(image_folder):
                image_file_name, image_file_extension = os.path.splitext(image_file)
                if image_file_extension == '' or image_file_name[0] == '.':
                    print("Skipping, not an image file.")
                else:
                    image = cv2.imread("%s/%s" % (image_folder, image_file), cv2.IMREAD_GRAYSCALE)
                    trimmed_image = util.trim(image)

                    # This is summing columns vertically
                    x_counts = trimmed_image.sum(axis=0)
                    # This is summing rows horizontally
                    y_counts = trimmed_image.sum(axis=1)

                    image_size = len(x_counts) * len(y_counts)
                    x_percentage = x_counts / image_size * 100
                    y_percentage = y_counts / image_size * 100

                    new_entry = {
                        'file_name': image_file,
                        'x': x_percentage.tolist(),
                        'y': y_percentage.tolist()
                    }
                    image_data[current_number]['raw_data'].append(new_entry)

                    number_of_items += 1
                    width_total += len(x_counts)
                    height_total += len(y_counts)
            print("Finished folder: %s" % image_folder)

    normalized_width = int(width_total / number_of_items)
    normalized_height = int(height_total / number_of_items)
    image_data['normalized_size'] = {'x': normalized_width, 'y': normalized_height}

    output_json(image_data, "%s/image_data.json" % output_folder)
    normalize_image_metrics(image_data)
    output_json(image_data, "%s/image_metrics.json" % output_folder)
elif findMinAndMax([7, 1, 3, 9, 5]) != (1, 9):
    print('Test 4 failed!')
else:
    print('Tests passed!')

print('******************')

for i, value in enumerate(['a', 'b', 'c']):
    print(i, value)

d = {'a': 1, 'b': 2, 'c': 3}
for key in d:
    print(key)

print('******************')

# Tests:
if trim('hello ') != 'hello':
    print('Test 1 failed!', trim('hello '))
elif trim(' hello') != 'hello':
    print('Test 2 failed!', trim(' hello'))
elif trim(' hello ') != 'hello':
    print('Test 3 failed!', trim(' hello '))
elif trim(' hello world ') != 'hello world':
    print('Test 4 failed!', trim(' hello world '))
elif trim('') != '':
    print('Test 5 failed!', trim(''))
elif trim(' ') != '':
    print('Test 6 failed!', trim(' '))
else:
    print('Tests passed!')

print('******************')
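The trim() exercised here is the user's own space-stripping function (the classic exercise of reimplementing str.strip() for spaces). One minimal implementation that would satisfy all six checks above, offered only as an illustrative sketch:

def trim(s):
    # Drop leading spaces, then trailing spaces, without using str.strip().
    while s and s[0] == ' ':
        s = s[1:]
    while s and s[-1] == ' ':
        s = s[:-1]
    return s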
def evaluate_model_prediction(self, token, model_id, run_idx, fold_idx, class_idx,
                              mode='binary', top_num=800):
    generated_sentences = pkl.load(
        open("../adversarial_data/insert_%s_natural_sentence_labeled.pkl" % token, 'rb'))
    original_int_arrs = generated_sentences['original_int_arr'][:top_num]
    revised_int_arrs = generated_sentences['generated_int_arr'][:top_num]
    tweet_ids = generated_sentences['tweet_id'][:top_num]

    all_tweets = self.dl.all_data()
    original_tweets = []
    generated_tweets = []
    tweetid2tweetidx = {}
    for idx in range(len(all_tweets)):
        tweetid2tweetidx[all_tweets[idx]['tweet_id']] = idx
    for idx in range(len(original_int_arrs)):
        tweet = all_tweets[tweetid2tweetidx[tweet_ids[idx]]]
        original_tweets.append(tweet)
        generated_tweet = deepcopy(tweet)
        assert np.all(generated_tweet['word_padded_int_arr'] == original_int_arrs[idx])
        generated_tweet['word_padded_int_arr'] = revised_int_arrs[idx]
        generated_tweet['word_int_arr'] = trim(generated_tweet['word_padded_int_arr'])
        generated_tweets.append(generated_tweet)

    generated_elmo_dir = None
    original_elmo_dir = None
    if model_id in (3, 4, 6, 7):  # DS ELMo
        generated_elmo_dir = "../adversarial_data/DS_ELMo_adversarial_insert_%s" % token
        original_elmo_dir = "../data/DS_ELMo_rep"
    if model_id == 5:  # NonDS ELMo
        generated_elmo_dir = "../adversarial_data/NonDS_ELMo_adversarial_insert_%s" % token
        original_elmo_dir = "../data/NonDS_ELMo_rep"

    load_model_tweet_dicts(model_id, generated_tweets, elmo_dir=generated_elmo_dir)
    generated_tweet_X = pkl.load(open("../data/adversarial_tweet_X.pkl", 'rb'))

    load_model_tweet_dicts(model_id, original_tweets, elmo_dir=original_elmo_dir)
    original_tweet_X = pkl.load(open("../data/adversarial_tweet_X.pkl", 'rb'))

    model = load_model(model_id, run_idx, fold_idx, class_idx)
    original_predictions = model.predict(original_tweet_X)
    generated_predictions = model.predict(generated_tweet_X)

    assert mode in ['score', 'binary']
    if mode == 'score':  # analyze prediction numerical score change
        return original_predictions, generated_predictions
    else:  # analyze label flipping
        threshold = get_model_info(
            num_runs=5, num_folds=5,
            num_models=model_id)['thresholds'][(model_id, run_idx)][class_idx][fold_idx]
        original_pred_labels = [1 if x >= threshold else 0 for x in original_predictions]
        generated_pred_labels = [1 if x >= threshold else 0 for x in generated_predictions]

        new_positive_tweet_ids = []
        new_negative_tweet_ids = []
        for idx in range(len(original_predictions)):
            if original_pred_labels[idx] == 0 and generated_pred_labels[idx] == 1:
                new_positive_tweet_ids.append(original_tweets[idx]['tweet_id'])
            if original_pred_labels[idx] == 1 and generated_pred_labels[idx] == 0:
                new_negative_tweet_ids.append(original_tweets[idx]['tweet_id'])
        return len(new_positive_tweet_ids)
def sanity_check(self):
    # For each two adjacent tweets, swap the word on every position and see if both tweets'
    # log probability decreases most of the time
    tweet_ids = list(self.dl.data['data'].keys())
    count_prob_decrease = 0  # number of times the revised sentence has lower probability than the original
    count_prob_increase = 0  # number of times the revised sentence has higher probability than the original
    prob_increase_samples = {}
    prob_increase_samples['original'] = []
    prob_increase_samples['revised'] = []
    prob_increase_samples['original score'] = []
    prob_increase_samples['revised score'] = []

    for idx in range(len(tweet_ids) - 1):
        tweet_id1 = tweet_ids[idx]
        tweet_id2 = tweet_ids[idx + 1]
        sentence1 = trim(self.dl.data['data'][tweet_id1]['word_padded_int_arr'])
        sentence2 = trim(self.dl.data['data'][tweet_id2]['word_padded_int_arr'])
        log_prob_sentence1 = self.compute_log_prob([sentence1])
        log_prob_sentence2 = self.compute_log_prob([sentence2])

        for word_idx in range(min(len(sentence1), len(sentence2))):
            # swap the two sentences' word on this position
            sentence1[word_idx], sentence2[word_idx] = sentence2[word_idx], sentence1[word_idx]
            log_prob_revised_sentence1 = self.compute_log_prob([sentence1])
            log_prob_revised_sentence2 = self.compute_log_prob([sentence2])

            if log_prob_revised_sentence1 <= log_prob_sentence1:
                count_prob_decrease += 1
            else:
                count_prob_increase += 1
                prob_increase_samples['revised'].append(self.dl.convert2unicode(sentence1))
                prob_increase_samples['revised score'].append(log_prob_revised_sentence1)
                prob_increase_samples['original score'].append(log_prob_sentence1)

            if log_prob_revised_sentence2 <= log_prob_sentence2:
                count_prob_decrease += 1
            else:
                count_prob_increase += 1
                prob_increase_samples['revised'].append(self.dl.convert2unicode(sentence2))
                prob_increase_samples['revised score'].append(log_prob_revised_sentence2)
                prob_increase_samples['original score'].append(log_prob_sentence2)

            # recover the original sentence
            sentence1[word_idx], sentence2[word_idx] = sentence2[word_idx], sentence1[word_idx]

            if log_prob_revised_sentence1 > log_prob_sentence1:
                prob_increase_samples['original'].append(self.dl.convert2unicode(sentence1))
            if log_prob_revised_sentence2 > log_prob_sentence2:
                prob_increase_samples['original'].append(self.dl.convert2unicode(sentence2))

        if idx % 10 == 0:
            print("decrease: ", count_prob_decrease)
            print("increase: ", count_prob_increase)
        if idx > 100:
            break

    print("Probability decrease: ", count_prob_decrease)
    print("Probability increase: ", count_prob_increase)
    pd.DataFrame.from_dict(prob_increase_samples).to_csv(
        "../showable/ELMo_sanity_check.csv", index=False)
print(fixer.candidates("incoerrct")) # Output: incorrect # Fix a string, results print on console fixer.fix("Thsi is a sentnce taht full of mistakes") # Initialize new fixer # rt1.txt and rt2.txt are files that contain some random texts for testing # Some words are spelled incorrectly in order to test the fixer f = fixer.Fixer( dir, # The root dir recursive=True, # Recursively check all the files fname=["rt1", "rt2"]) # Only certain files will be checked # Start fixing files, results print on console f.fix() import util # print all files under resources folder print(util.find(dir)) # print files with given filenames print(util.find(dir, fname=["rt1"])) # print files with given extensions print(util.find(dir, suffix=[".txt"])) print(util.trim("exam?ple!")) # output: example print(util.words("Separate words from sentence")) # Output: ['separate', 'words', 'from', 'sentence'] # Find all the comments in a py file, here use corrector.py as an example examplepy = util.find(dir, fname=["corrector"])[0] print(util.find_comment_py(examplepy))