Example #1
    def get(self):
        self.values["project"] = "http://www.proven-corporation.com/software/app-engine-console/"

        if self.values["subpage"] == "usage":
            for exampleNum in range(len(self.examples)):
                key = "example%d" % (exampleNum + 1)
                val = util.trim(self.examples[exampleNum])
                val = pygments.highlight(val, self.resultLexer, self.outputFormatter).strip()
                self.values[key] = val
        elif self.values["subpage"] == "integration":
            self.values["example1"] = pygments.highlight(
                util.trim(
                    """
                def is_dev():
                    import os
                    return os.environ['SERVER_SOFTWARE'].startswith('Dev')
            """
                ),
                self.pythonLexer,
                self.outputFormatter,
            ).strip()
            self.values["example2"] = pygments.highlight(
                util.trim(
                    """
                >>> is_dev()
                True
            """
                ),
                self.resultLexer,
                self.outputFormatter,
            ).strip()
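The pythonLexer, resultLexer, and outputFormatter attributes are not shown in this snippet. A plausible setup, assumed here for illustration rather than taken from the project, would use the standard Pygments classes:

import pygments
from pygments.lexers import PythonLexer, PythonConsoleLexer
from pygments.formatters import HtmlFormatter

# Hypothetical attribute setup for the highlight() calls above
pythonLexer = PythonLexer()          # plain Python source, e.g. the is_dev() example
resultLexer = PythonConsoleLexer()   # ">>>" interpreter transcripts
outputFormatter = HtmlFormatter()    # renders the highlighted tokens as HTML

html = pygments.highlight("print('hi')", pythonLexer, outputFormatter).strip()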
Example #2
def compute_similarity(path1, path2):
    """
    Compute the similarity between two wav audio files

    Keyword arguments:
    path1 -- the path of the first audio file
    path2 -- the path of the second audio file

    Return:
    a tuple containing the similarity score, the name of the first file and the name of the second file
    """

    # Read the audio files
    fs1, data1 = wavfile.read(path1)
    fs2, data2 = wavfile.read(path2)
    if fs1 != fs2:
        print('Files do not have the same sample frequency')
        # Keep the documented return shape: a (score, name, name) tuple with a zero score
        return (0.0, path1.split('/')[-1].split('.')[0],
                path2.split('/')[-1].split('.')[0])

    # Align and sign the tracks
    track1 = trim(data1)
    track2 = trim(data2)
    signed1 = sign_track(track1, fs1)
    signed2 = sign_track(track2, fs2)

    # Sanitize the filenames
    filename1 = path1.split('/')[-1].split('.')[0]
    filename2 = path2.split('/')[-1].split('.')[0]
    return (signatures_similarity(signed1, signed2), filename1, filename2)
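The trim and sign_track helpers are not shown in this example. As an assumption only, trimming an audio track here most likely means dropping leading and trailing near-silence, roughly like:

import numpy as np

def trim(data, threshold=0.01):
    # Assumed behaviour (mono signal): drop leading/trailing samples whose
    # amplitude falls below `threshold` of the track's peak.
    amplitude = np.abs(data.astype(float))
    loud = amplitude > threshold * amplitude.max()
    if not loud.any():
        return data
    first = int(np.argmax(loud))
    last = len(loud) - int(np.argmax(loud[::-1]))
    return data[first:last]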
def format_enum_definition(enum_indent, base_indent, enum_definition):
    formatted_definition = ''
    enum_prefix = trim(enum_definition.split(open_brace)[0])
    formatted_definition += enum_indent + enum_prefix + '\n'
    formatted_definition += enum_indent + open_brace

    enum_definition = trim(trim(trim(enum_definition)[len(enum_prefix):])[1:])
    enum_definition = enum_definition.replace('};', '')
    entries = enum_definition.split(comma)
    for entry in entries:
        if entry != entries[0]:
            formatted_definition += comma
        formatted_definition += '\n' + enum_indent + base_indent + trim(entry)
    formatted_definition += '\n' + enum_indent + '};\n'
    return formatted_definition
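The module-level open_brace, comma, and trim names are not shown here. Assuming open_brace = '{', comma = ',' and a trim that behaves like str.strip, a hypothetical call reflows a one-line enum:

open_brace, comma, trim = '{', ',', str.strip  # assumed helpers, not from the original module
print(format_enum_definition('', '    ', 'enum Color { RED, GREEN, BLUE };'))
# Under these assumptions the output is:
# enum Color
# {
#     RED,
#     GREEN,
#     BLUE
# };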
    def predict(self, word_int_arrs, mode='score'):
        """
        Predict the aggression/loss probability or label for a batch of word int arrays.
        """
        sentences = [trim(x) for x in word_int_arrs]
        features = []
        for sentence in sentences:
            representation = np.zeros((40000, ))
            for word_id in sentence:
                representation[word_id] += 1
            features.append(representation)
        sparse_features = sparse.csr_matrix(features)
        aggression_pred_scores = self.classifiers[0].predict_proba(
            sparse_features)[:, 1]
        loss_pred_scores = self.classifiers[1].predict_proba(
            sparse_features)[:, 1]

        assert mode in ['score', 'binary']
        if mode == 'score':
            return aggression_pred_scores, loss_pred_scores
        else:
            aggression_pred_labels = [
                1 if x >= self.thresholds[0] else 0
                for x in aggression_pred_scores
            ]
            loss_pred_labels = [
                1 if x >= self.thresholds[1] else 0 for x in loss_pred_scores
            ]
            return aggression_pred_labels, loss_pred_labels
Example #6
    def get(self, request, task_name=None, format=None):
        # task_name = filter(None, request._request.path.split('/'))[-1]
        docstring = trim(task_docstring(task_name))
        curl_url = reverse('run-main', kwargs={'task_name': task_name}, request=request)
        # reverse("%s-run" % (task_name), request=request)
        data = {'task_name': task_name, 'task_docstring': docstring, 'task_url': curl_url, 'queue': 'celery'}
        return Response(data)
Example #8
	def print_databases(self):
		"""Show all databases"""
		f = self.file
		head = util.read_meta(f,"hDatabases")

		db = Database(f,"_default")
		f.seek(head)
		db.addr = f.tell()
		bdata = f.read(52)
		db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ",bdata)
		db.name = util.trim(db.name)
		print db
		while db.next != 0:
			f.seek(db.next)
			db.addr = f.tell()
			bdata = f.read(52)
			db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ",bdata)
			db.name = util.trim(db.name)
			print db
Example #9
    def get(self, request, task_name=None, format=None):
        # task_name = filter(None, request._request.path.split('/'))[-1]
        docstring = trim(task_docstring(task_name))
        curl_url = reverse('run-main', kwargs={'task_name': task_name}, request=request)
        # reverse("%s-run" % (task_name), request=request)
        username = self.get_username(request)
        if not username == "guest":
            token = Token.objects.get_or_create(user=self.request.user)
            auth_token = str(token[0])
        else:
            auth_token = "< authorized-token > "
        data = {'task_name': task_name, 'task_docstring': docstring, 'task_url': curl_url, 'queue': 'celery', 'auth_token': auth_token}
        return Response(data)
Example #10
	def pass_message(self):
		self._message_subject = \
		          util.trim(self._message['subject'].lower())
		self._from = self._message['From']
		self._date = self._message['Date']
		self._body = self._message.get_payload(decode=True)
		try:
			(self._build, self._name) = \
			util.get_branch_gitname(self._message_subject)
		except:
			self._valid = False
		else:
			self._valid = True
Example #11
	def find_last(self):
		"""
			Returns last database
		"""
		f = self.file
		head = util.read_meta(f,"tDatabases")

		db = Database(f)
		f.seek(head)
		db.addr = f.tell()
		bdata = f.read(52)
		db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ",bdata)
		db.name = util.trim(db.name)
		return db			
Example #12
	def exists(self):
		"""Checks for database existence"""

		f = self.file
		head = util.read_meta(f,"hDatabases")
		

		db = Database(f,"_default")
		f.seek(head)
		db.addr = f.tell()
		bdata = f.read(52)
		db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ",bdata)
		db.name = util.trim(db.name)

		while db.next != 0:
			f.seek(db.next)
			db.addr = f.tell()
			bdata = f.read(52)
			db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ",bdata)
			db.name = util.trim(db.name)
			if db.name == self.name:
				return True
		return False
    def create_vectorized_representation(self, tweet_data):
        """
        Return the count vectorized and tf-transformed representation of the input tweets.
        """
        sentences = [
            trim(tweet['word_padded_int_arr']) for tweet in tweet_data
        ]
        features = []
        for sentence in sentences:
            representation = np.zeros((40000, ))
            for word_id in sentence:
                representation[word_id] += 1
            features.append(representation)
        features = sparse.csr_matrix(features)
        return features
def create_adversarial_ELMo_representation(domain_specific,
                                           input_file,
                                           output_dir,
                                           parameter_dir=None):
    generated_sentences = pkl.load(open(input_file, 'rb'))
    revised_int_arrs = generated_sentences['generated_int_arr']
    tweet_ids = generated_sentences['tweet_id']
    tweet_dict = {}
    for idx in range(len(tweet_ids)):
        tweet_dict[tweet_ids[idx]] = {}
        tweet_dict[tweet_ids[idx]]['word_int_arr'] = trim(
            revised_int_arrs[idx])
    create_ELMo_representation(tweet_dict,
                               domain_specific=domain_specific,
                               output_dir=output_dir,
                               parameter_dir=parameter_dir)
def get_screen_captures(window, num_frames, from_top, from_bottom):
    screen_width = GetSystemMetrics(SM_CXSCREEN)
    screen_height = GetSystemMetrics(SM_CYSCREEN)
    box = (0, from_top, screen_width, screen_height - from_bottom)

    util.simple_mouse(MOUSEEVENTF_MOVE|MOUSEEVENTF_ABSOLUTE, screen_width / 2,\
                 screen_height / 2)
    util.simple_mouse(MOUSEEVENTF_LEFTDOWN, 0, 0)
    util.simple_mouse(MOUSEEVENTF_LEFTUP, 0, 0)
    time.sleep(0.1)

    SendKeys.SendKeys("{HOME}")

    for i in range(0, num_frames):
        img = ImageGrab.grab(box)
        img = util.trim(img)
        img.save('images/image%d.png' % i)
        SendKeys.SendKeys("{DOWN}")
def create_ELMo_representation(tweet_dicts,
                               domain_specific,
                               output_dir,
                               masked_unigram_id=None,
                               parameter_dir=None):
    """
    Create ELMo representation for all labeled tweets with a certain unigram masked as UNK. The ELMo representations will
    be stored as .npy files for each tweet.
    """
    if domain_specific:
        args = {}
        args['experiment_path'] = parameter_dir
        path = args['experiment_path']
        args = pkl.load(open('../experiments/%s.param' % path, 'rb'))
        args['experiment_path'] = path
        bilm = create_bilm_from_args(args)
    else:
        words = pkl.load(open('../model/word.pkl', 'rb'))
        id2word = dict([(words[w]['id'], w.decode()) for w in words])

    data_dir = '../data/' + output_dir
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    for tweet_id in tweet_dicts:
        word_int_arr = trim(tweet_dicts[tweet_id]['word_int_arr'][:50])

        #mask the specified unigram as UNK if masked_unigram_id is specified
        if masked_unigram_id is not None:
            for word_idx in range(len(word_int_arr)):
                if word_int_arr[word_idx] == masked_unigram_id:
                    word_int_arr[word_idx] = 1

        #use DS ELMo/NonDS ELMo code to generate corresponding elmo representation
        if domain_specific:
            elmo_rep = np.array([
                x.detach().cpu().numpy()[0]
                for x in bilm.create_rep([word_int_arr])
            ])
        else:
            sentence = [id2word[idx] for idx in word_int_arr]
            elmo_rep = embed_one_sentence(sentence)
        np.save("%s%d.npy" % (data_dir, tweet_id), elmo_rep)
def extract_matrix_from_image(image, x_metrics, y_metrics):

    x_coords, y_coords = util.get_cell_boundaries(image)

    # Now extract the images of the cells (between the lines), based on the coordinates we just
    # calculated. When we have these extracted images, sum the pixel counts along the horizontal
    # and vertical axes. This will be used to compare with training data to identify the digits
    # in the cells

    pyplot.rcParams['figure.dpi'] = 50
    bw_threshold = 160
    # Make the image monochrome. If the original pixel value > threshold, 1, otherwise 0.
    (thresh, monochrome_image) = cv2.threshold(image, bw_threshold, 1, cv2.THRESH_BINARY_INV)

    puzzle_matrix = []
    for y_coord in y_coords:
        new_matrix_row = []
        for x_coord in x_coords:
            raw_image = monochrome_image[y_coord[0]:y_coord[1], x_coord[0]:x_coord[1]]
            image_height, image_width = raw_image.shape
            image_sum = raw_image.sum()
            image_density = image_sum / (image_width * image_height)
            # If the image density (% of black pixels in the image) is less than a certain threshold
            # we assume the cell is empty and return 0. This is not a test for 0 % since there can be
            # noise in the image. If above the threshold, then determine the number from training data
            if image_density < 0.001:
                number = 0
            else:
                # show_image(raw_image,
                #            title="Y - %s:%s. X - %s:%s" % (y_coord[0], y_coord[1], x_coord[0], x_coord[1]))
                cell_image = util.trim(raw_image)
                if cell_image is None:
                    number = 0
                else:
                    number = get_number_from_image(cell_image, x_metrics, y_metrics)
            print("Number: %s" % number)
            new_matrix_row.append(number)
        puzzle_matrix.append(new_matrix_row)
        print("\n")
    return puzzle_matrix
Example #18
def make_junto(filename, title, folder='.', threshold=0, ret=False):
	
	if not '_spread' in filename:
		print 'Spreading file...\n'
		filename = util.spread(filename)
	
	print 'Making graph...'
	g = graph.make_graph(filename)
	
	if threshold:
		print '\nTrimming...'
		g['procedures'] = util.trim(g['procedures'], threshold)
	
	print '\nConstructing junto files...'
	graph.to_junto(g, title)
	
	with open(title + '_config', 'w') as f:
		f.write(config % {'title': title, 'folder': folder})
	
	print '\n...Done'
	if ret:
		return g
Example #19
def compute_all_similarities(folder):
    """
    Compute the similarity between all pairs of wav audio files in the folder

    Keyword arguments:
    folder -- the folder where the audio files are located

    Return:
    a sorted list of tuples containing the similarity score,
    the name of the first file and the name of the second file
    """

    signatures = {}
    similarities = []

    reference_fs, _ = wavfile.read(folder + '/' + os.listdir(folder)[0])

    # Sign all the audio files in the folder
    for filename in os.listdir(folder):
        fs, data = wavfile.read(folder + '/' + filename)
        if fs != reference_fs:
            print(filename, 'does not have the same sample frequency')
        else:
            track = trim(data)
            signature = sign_track(track, fs)
            signatures[filename] = signature

    # Compute the similarity score between each pair of signed tracks
    for idx, key1 in enumerate(list(signatures.keys())):
        signed1 = signatures[key1]
        for key2 in list(signatures.keys())[idx + 1:]:
            signed2 = signatures[key2]
            similarity = signatures_similarity(signed1, signed2)
            similarities.append(
                (similarity, key1.split('.')[0], key2.split('.')[0]))

    similarities.sort(reverse=True)
    return similarities
    def create_natural_sentences(self, mode, token, tweet_dicts):
        assert mode in ['insert', 'replace']
        token_id = self.dl.token2property[token.encode("utf-8")]['id']
        sentence_outputs = {}
        keys = [
            'original_sentence', 'generated_sentence', 'original_prob',
            'generated_prob', 'original_int_arr', 'generated_int_arr',
            'tweet_id'
        ]
        for key in keys:
            sentence_outputs[key] = []

        for tweet_id in tweet_dicts.keys():
            sentence = tweet_dicts[tweet_id]['word_padded_int_arr']
            num_words = sum([x != 0 for x in sentence])

            if mode == 'insert':
                if num_words == 50:  #already max length, cannot add more words
                    continue
                idx_range = range(num_words + 1)
            else:
                idx_range = range(num_words)

            sentence_outputs['original_int_arr'].append(np.array(sentence))
            original_sentence_unicode = self.dl.convert2unicode(trim(sentence))
            sentence_outputs['original_sentence'].append(
                original_sentence_unicode)
            original_sentence_prob = self.compute_log_prob([trim(sentence)])
            sentence_outputs['original_prob'].append(original_sentence_prob)
            sentence_outputs['tweet_id'].append(tweet_id)

            max_generated_prob = -np.inf
            most_natural_generated_sentence = None

            for pos in idx_range:
                if mode == 'insert':
                    generated_sentence = insert_element(
                        sentence, pos, token_id)
                else:
                    generated_sentence = np.array(sentence)
                    generated_sentence[pos] = token_id

                new_sentence_prob = self.compute_log_prob(
                    [trim(generated_sentence)])
                if new_sentence_prob > max_generated_prob:
                    max_generated_prob = new_sentence_prob
                    most_natural_generated_sentence = generated_sentence

            most_natural_revised_sentence_unicode = self.dl.convert2unicode(
                trim(most_natural_generated_sentence))
            sentence_outputs['generated_sentence'].append(
                most_natural_revised_sentence_unicode)
            sentence_outputs['generated_prob'].append(max_generated_prob)
            sentence_outputs['generated_int_arr'].append(
                np.array(most_natural_generated_sentence))

            if len(sentence_outputs['generated_int_arr']) % 100 == 0:
                print(len(sentence_outputs['generated_int_arr']))
                pkl.dump(
                    sentence_outputs,
                    open(
                        "../adversarial_data/%s_%s_natural_sentence_%s.pkl" %
                        (mode, token, self.dataset), 'wb'))

        # order the records from largest to smallest probability increase
        prob_diff = np.array(sentence_outputs['generated_prob']) - np.array(
            sentence_outputs['original_prob'])
        sorted_idx = np.argsort(prob_diff)[::-1]
        for key in sentence_outputs.keys():
            sentence_outputs[key] = [
                sentence_outputs[key][idx] for idx in sorted_idx
            ]
        sentence_outputs['prob_change'] = np.array(
            sentence_outputs['generated_prob']) - np.array(
                sentence_outputs['original_prob'])
        pd.DataFrame.from_dict(sentence_outputs).to_csv(
            "../showable/%s_%s_natural_sentence_%s.csv" %
            (mode, token, self.dataset),
            index=False)
        pkl.dump(
            sentence_outputs,
            open(
                "../adversarial_data/%s_%s_natural_sentence_%s.pkl" %
                (mode, token, self.dataset), 'wb'))
Example #21
            registry.merge_developers()

            # Link developers
            igdb.link_developers()
            steam.link_developers()

            # Merge articles
            registry.merge_articles()

            # Merge videos
            registry.merge_videos()

            # Merge tweets
            registry.merge_tweets()

            trim()

            WS.flush()
            print("[MAIN] Rebuild completed in %d seconds" % (time() - t))
        elif action == '2':
            WS.flush()

        elif action == '3':
            registry.merge_games()
        elif action == '4':
            registry.merge_developers()
        elif action == '5':
            registry.merge_articles()
        elif action == '6':
            registry.merge_videos()
        elif action == '7':
Example #22
    def lime(self):
        tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length = self.create_perturbation_samples(
            self.tweet_records)

        (idx2max_min_wordidx, first_round_unigram_ranking,
         first_round_max_influences, first_round_all_influences) \
            = self.analyze_perturbations_influence(tweet_idx_word_idx2idx, perturbed_tests,
                                                   idx2sent_length, round=1,
                                                   observe_word_ids=self.unigram_observe_ids)

        self.masked_tweet_records = {}
        for key in self.tweet_records.keys():
            if key != 'word_content_input_elmo' or self.pad_elmo is True:
                self.masked_tweet_records[key] = np.array(
                    self.tweet_records[key])
            else:
                self.masked_tweet_records[key] = self.tweet_records[key]

        for idx in range(len(idx2max_min_wordidx)):
            self.masked_tweet_records['word_content_input'][idx][
                idx2max_min_wordidx[idx][2]] = 1  #mask insignificant unigram

        if self.input_format == 'discrete':
            tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length = self.create_perturbation_samples(
                self.masked_tweet_records)
        else:
            elmo_masked_wordidx = [
                idx2max_min_wordidx[idx][2]
                for idx in range(len(idx2max_min_wordidx))
            ]
            tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length = self.create_perturbation_samples(
                self.masked_tweet_records, elmo_masked_idx=elmo_masked_wordidx)

        observe_word_idx = {}
        for idx in range(len(idx2max_min_wordidx)):
            observe_word_idx[idx] = idx2max_min_wordidx[idx][1]
        second_round_idx2max_min_wordidx, second_round_ranking, second_round_max_influences, second_round_all_influences = \
            self.analyze_perturbations_influence(tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length,
                                                 round=2, observe_word_position_idx=observe_word_idx)

        data = {}
        data['original tweet'] = [
            self.dl.convert2unicode(trim(arr))
            for arr in self.tweet_records['word_content_input']
        ]
        data['masked tweet'] = [
            self.dl.convert2unicode(trim(arr))
            for arr in self.masked_tweet_records['word_content_input']
        ]
        data['first round influences'] = first_round_all_influences
        data['first round max influential unigram'] = [
            self.dl.convert2unicode([
                self.tweet_records['word_content_input'][idx][
                    idx2max_min_wordidx[idx][1]]
            ]) for idx in range(len(idx2sent_length))
        ]
        data['first round most insignificant unigram'] = [
            self.dl.convert2unicode([
                self.tweet_records['word_content_input'][idx][
                    idx2max_min_wordidx[idx][2]]
            ]) for idx in range(len(idx2sent_length))
        ]
        data['first round max influence'] = first_round_max_influences
        data['second round influences'] = second_round_all_influences
        data['second round most influential unigram'] = [
            self.dl.convert2unicode([
                self.tweet_records['word_content_input'][idx][
                    second_round_idx2max_min_wordidx[idx][1]]
            ]) for idx in range(len(idx2sent_length))
        ]
        data['second round max influence'] = second_round_max_influences
        data[
            'first round max influential unigram ranking in second round'] = second_round_ranking
        if self.unigram_observe_ids is not None:
            for unigram_id in self.unigram_observe_ids:
                data['first round unigram %s ranking' %
                     id2word[unigram_id]] = first_round_unigram_ranking[
                         unigram_id]
        pd.DataFrame.from_dict(data).to_csv(self.output_dir, index=False)

        second_round_rank_stats = defaultdict(int)
        for num in second_round_ranking:
            second_round_rank_stats[num] += 1

        # first_round_unigram_ranking uses -1 to indicate that the specified unigram is not in the tweet;
        # filter out those -1 rankings to keep only tweets that include the specified unigram
        first_round_unigram_ranking_included = {}
        for unigram_id in self.unigram_observe_ids:
            first_round_unigram_ranking_included[unigram_id] = []
            for i in first_round_unigram_ranking[unigram_id]:
                if i != -1:
                    first_round_unigram_ranking_included[unigram_id].append(i)

        first_round_rank_stats = {}
        for unigram_id in self.unigram_observe_ids:
            stats = defaultdict(int)
            for num in first_round_unigram_ranking_included[unigram_id]:
                stats[num] += 1
            first_round_rank_stats[unigram_id] = stats

        return {
            'unigram_rank_stats': first_round_rank_stats,
            'lime_consistency_stats': second_round_rank_stats,
            'first_round_all_influences': first_round_all_influences,
            'correspondence': self.tweet_id_considered
        }
Example #23
    def forward(self, X):
        if self.input_format == 'discrete':
            sentence = Variable(torch.LongTensor(trim(
                X['word_content_input'])))
            embedded_val = self.dropout_emb(self.wemb(sentence[:, None]))
        else:
            # permute the axis order for convenience in the model's forward pass;
            # ELMo dimension-axis reordering is performed by the model itself.
            elmo_rep_permuted = np.swapaxes(
                np.swapaxes(X['word_content_input_elmo'], 0, 2), 0, 1)

        if self.input_format == "elmo":
            self.normalized_weight = F.softmax(self.kernel_weight, dim=0)
            if self.elmo_option == 'add_average':
                self.second_normalized_weight = F.softmax(
                    self.second_kernel_weight, dim=0)

            sentence = Variable(torch.FloatTensor(elmo_rep_permuted))

            average_repr = torch.squeeze(sentence @ self.normalized_weight,
                                         dim=2)
            if self.elmo_option == 'add_average':
                all_repr = torch.cat((sentence, average_repr.unsqueeze(dim=2)),
                                     dim=2)
                average_repr = torch.squeeze(
                    all_repr @ self.second_normalized_weight, dim=2)
            average_repr = torch.unsqueeze(average_repr, dim=1)
            embedded_val = self.dropout_elmo(average_repr)

        elif self.input_format == 'both':
            self.normalized_weight = F.softmax(self.kernel_weight, dim=0)
            if self.elmo_option == 'add_average':
                self.second_normalized_weight = F.softmax(
                    self.second_kernel_weight, dim=0)

            #discrete input format features
            sentence = Variable(torch.LongTensor(trim(
                X['word_content_input'])))
            discrete_embedded_val = self.dropout_emb(
                self.wemb(sentence[:, None])).permute(0, 2, 1)

            #elmo input format features
            sentence = Variable(torch.FloatTensor(elmo_rep_permuted))
            sentence = torch.cat((discrete_embedded_val, sentence), dim=2)
            average_repr = torch.squeeze(sentence @ self.normalized_weight,
                                         dim=2)
            if self.elmo_option == 'add_average':
                all_repr = torch.cat((sentence, average_repr.unsqueeze(dim=2)),
                                     dim=2)
                average_repr = torch.squeeze(
                    all_repr @ self.second_normalized_weight, dim=2)
            average_repr = torch.unsqueeze(average_repr, dim=1)
            embedded_val = self.dropout_elmo(average_repr)

        hs, (q, _) = self.encoder(embedded_val.float())

        if self.use_attn:
            hs = self.dropout_lstm(hs)
        else:
            q = self.dropout_lstm(q)
        hs = hs.squeeze_(dim=1)

        # get concatenated representation of hidden states with/without context features
        concatenated_representation = []
        for hidden_state in hs:
            if self.context_feature_before:
                for feature_name in self.context_feature_names:
                    hidden_state = torch.cat(
                        (hidden_state, torch.Tensor(X[feature_name])))
                concatenated_representation.append(hidden_state)
            else:
                concatenated_representation.append(hidden_state)
        concatenated_representation = torch.stack(concatenated_representation)

        if self.verbose:
            print("concatenated hidden states' shape: " +
                  str(concatenated_representation.shape))

        # get concatenated representation of last state of LSTM with/without context features
        concatenated_q = q.view(1, -1)[0]
        if self.context_feature_before:
            for feature_name in self.context_feature_names:
                concatenated_q = torch.cat(
                    (concatenated_q, torch.Tensor(X[feature_name])))

        if self.verbose:
            print("concatenated last state's shape: " +
                  str(concatenated_q.shape))

        # calculate attention
        if not self.use_attn:
            z = concatenated_q
        else:
            A = F.softmax(torch.tanh(self.W_a(concatenated_representation)),
                          dim=0)
            z = (torch.transpose(A, 0, 1) @ concatenated_representation)[0]

        if self.verbose:
            print("feature representation shape: " + str(z.shape))

        if self.context_feature_last:
            for feature_name in self.context_feature_names:
                z = torch.cat((z, torch.Tensor(X[feature_name])))

        if self.verbose:
            print("last tensor representation shape: " + str(z.shape))

        output = torch.sigmoid(self.fc(z)).view(-1, )

        return_dict = {'output': output}
        if self.use_attn:
            return_dict['attn'] = A.view(1, -1)

        return return_dict
Example #24
__author__ = 'jens'

import util

# Reads multi-line input and copies all unique lines to the clipboard.
# It also trims all leading and trailing spaces.

multiLineInput = util.raw_multi_line_input()
uniqueSorted = util.trim(sorted(util.create_set(multiLineInput, lambda s: s.lower()), key=str.lower))
util.copy_to_clipboard(uniqueSorted)
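Here util.trim is applied to a whole list of lines; judging from the comment above, it presumably strips surrounding spaces from each line, along the lines of this assumed sketch (not the library's actual code):

def trim(lines):
    # Assumed behaviour: strip leading and trailing whitespace from every line in the list.
    return [line.strip() for line in lines]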
def main():
    if len(sys.argv) < 3:
        raise Exception(
            "Input parameter 1: folder of folders of training data images. Input parameter 2: folder for JSON output representing image metrics."
        )
    # input folder of images organized into folders named by the number shown in the images
    input_folder = sys.argv[1]
    # output folder for json metrics
    output_folder = sys.argv[2]

    image_data = {}
    for i in range(1, 10):
        image_data[i] = {}
        image_data[i]['raw_data'] = []
        image_data[i]['normalized_data'] = []

    # These are to calculate average image width and height
    number_of_items = 0
    width_total = 0
    height_total = 0

    for image_folder_name in listdir(input_folder):
        image_folder = "%s/%s" % (input_folder, image_folder_name)
        if os.path.isdir(image_folder):
            print("Processing: %s..." % image_folder_name)
            # current_number is the number in the images in this folder
            current_number = int(image_folder_name)
            for image_file in listdir(image_folder):
                image_file_name, image_file_extension = os.path.splitext(
                    image_file)
                if image_file_extension == '' or image_file_name[0] == '.':
                    print("Skipping, not an image file.")
                else:
                    image = cv2.imread("%s/%s" % (image_folder, image_file),
                                       cv2.IMREAD_GRAYSCALE)
                    trimmed_image = util.trim(image)
                    # This is summing columns vertically
                    x_counts = trimmed_image.sum(axis=0)
                    # This is summing rows horizontally
                    y_counts = trimmed_image.sum(axis=1)
                    image_size = len(x_counts) * len(y_counts)
                    x_percentage = x_counts / image_size * 100
                    y_percentage = y_counts / image_size * 100

                    new_entry = {
                        'file_name': image_file,
                        'x': x_percentage.tolist(),
                        'y': y_percentage.tolist()
                    }
                    image_data[current_number]['raw_data'].append(new_entry)

                    number_of_items += 1
                    width_total += len(x_counts)
                    height_total += len(y_counts)

            print("Finished folder: %s" % image_folder)
    normalized_width = int(width_total / number_of_items)
    normalized_height = int(height_total / number_of_items)
    image_data['normalized_size'] = {
        'x': normalized_width,
        'y': normalized_height
    }
    output_json(image_data, "%s/image_data.json" % output_folder)
    normalize_image_metrics(image_data)

    output_json(image_data, "%s/image_metrics.json" % output_folder)
elif findMinAndMax([7, 1, 3, 9, 5]) != (1, 9):
    print('Test 4 failed!')
else:
    print('Tests passed!')

print('******************')
for i, value in enumerate(['a', 'b', 'c']):
    print(i, value)

d = {'a': 1, 'b': 2, 'c': 3}
for key in d:
    print(key)

print('******************')
# Tests:
if trim('hello  ') != 'hello':
    print('Test 1 failed!', trim('hello  '))
elif trim('  hello') != 'hello':
    print('Test 2 failed!', trim('  hello'))
elif trim('  hello  ') != 'hello':
    print('Test 3 failed!', trim('  hello  '))
elif trim('  hello  world  ') != 'hello  world':
    print('Test 4 failed!', trim('  hello  world  '))
elif trim('') != '':
    print('Test 5 failed!', trim(''))
elif trim('    ') != '':
    print('Test 6 failed!', trim('    '))
else:
    print('Tests passed!')

print('******************')
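For reference, one minimal trim() that passes all of the checks above (a sketch; the tutorial's own solution may differ):

def trim(s):
    # Remove leading and trailing spaces without using str.strip()
    while s[:1] == ' ':
        s = s[1:]
    while s[-1:] == ' ':
        s = s[:-1]
    return s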
    def evaluate_model_prediction(self,
                                  token,
                                  model_id,
                                  run_idx,
                                  fold_idx,
                                  class_idx,
                                  mode='binary',
                                  top_num=800):
        generated_sentences = pkl.load(
            open(
                "../adversarial_data/insert_%s_natural_sentence_labeled.pkl" %
                token, 'rb'))
        original_int_arrs = generated_sentences['original_int_arr'][:top_num]
        revised_int_arrs = generated_sentences['generated_int_arr'][:top_num]
        tweet_ids = generated_sentences['tweet_id'][:top_num]

        all_tweets = self.dl.all_data()
        original_tweets = []
        generated_tweets = []

        tweetid2tweetidx = {}
        for idx in range(len(all_tweets)):
            tweetid2tweetidx[all_tweets[idx]['tweet_id']] = idx

        for idx in range(len(original_int_arrs)):
            tweet = all_tweets[tweetid2tweetidx[tweet_ids[idx]]]
            original_tweets.append(tweet)
            generated_tweet = deepcopy(tweet)
            assert np.all(generated_tweet['word_padded_int_arr'] ==
                          original_int_arrs[idx])
            generated_tweet['word_padded_int_arr'] = revised_int_arrs[idx]
            generated_tweet['word_int_arr'] = trim(
                generated_tweet['word_padded_int_arr'])
            generated_tweets.append(generated_tweet)

        generated_elmo_dir = None
        original_elmo_dir = None
        if model_id in (3, 4, 6, 7):  #DS ELMo
            generated_elmo_dir = "../adversarial_data/DS_ELMo_adversarial_insert_%s" % token
            original_elmo_dir = "../data/DS_ELMo_rep"
        if model_id == 5:  #NonDS ELMo
            generated_elmo_dir = "../adversarial_data/NonDS_ELMo_adversarial_insert_%s" % token
            original_elmo_dir = "../data/NonDS_ELMo_rep"

        load_model_tweet_dicts(model_id,
                               generated_tweets,
                               elmo_dir=generated_elmo_dir)
        generated_tweet_X = pkl.load(
            open("../data/adversarial_tweet_X.pkl", 'rb'))

        load_model_tweet_dicts(model_id,
                               original_tweets,
                               elmo_dir=original_elmo_dir)
        original_tweet_X = pkl.load(
            open("../data/adversarial_tweet_X.pkl", 'rb'))

        model = load_model(model_id, run_idx, fold_idx, class_idx)
        original_predictions = model.predict(original_tweet_X)
        generated_predictions = model.predict(generated_tweet_X)

        assert mode in ['score', 'binary']
        if mode == 'score':  # analyze prediction numerical score change
            return original_predictions, generated_predictions

        else:  # analyze label flipping
            threshold = get_model_info(num_runs=5,
                                       num_folds=5,
                                       num_models=model_id)['thresholds'][(
                                           model_id,
                                           run_idx)][class_idx][fold_idx]
            original_pred_labels = [
                1 if x >= threshold else 0 for x in original_predictions
            ]
            generated_pred_labels = [
                1 if x >= threshold else 0 for x in generated_predictions
            ]
            new_positive_tweet_ids = []
            new_negative_tweet_ids = []

            for idx in range(len(original_predictions)):
                if original_pred_labels[idx] == 0 and generated_pred_labels[
                        idx] == 1:
                    new_positive_tweet_ids.append(
                        original_tweets[idx]['tweet_id'])
                if original_pred_labels[idx] == 1 and generated_pred_labels[
                        idx] == 0:
                    new_negative_tweet_ids.append(
                        original_tweets[idx]['tweet_id'])
            return len(new_positive_tweet_ids)
    def sanity_check(self):
        # For each pair of adjacent tweets, swap the words at every position and check that both
        # tweets' log probabilities decrease most of the time
        tweet_ids = list(self.dl.data['data'].keys())
        count_prob_decrease = 0  # number of times the revised sentence has lower probability than original sentence
        count_prob_increase = 0  # number of times the revised sentence has higher probability than original sentence
        prob_increase_samples = {}
        prob_increase_samples['original'] = []
        prob_increase_samples['revised'] = []
        prob_increase_samples['original score'] = []
        prob_increase_samples['revised score'] = []

        for idx in range(len(tweet_ids) - 1):
            tweet_id1 = tweet_ids[idx]
            tweet_id2 = tweet_ids[idx + 1]

            sentence1 = trim(
                self.dl.data['data'][tweet_id1]['word_padded_int_arr'])
            sentence2 = trim(
                self.dl.data['data'][tweet_id2]['word_padded_int_arr'])

            log_prob_sentence1 = self.compute_log_prob([sentence1])
            log_prob_sentence2 = self.compute_log_prob([sentence2])
            for word_idx in range(min(len(sentence1), len(sentence2))):
                # swap the two sentences' words at this position
                sentence1[word_idx], sentence2[word_idx] = sentence2[
                    word_idx], sentence1[word_idx]
                log_prob_revised_sentence1 = self.compute_log_prob([sentence1])
                log_prob_revised_sentence2 = self.compute_log_prob([sentence2])
                if log_prob_revised_sentence1 <= log_prob_sentence1:
                    count_prob_decrease += 1
                else:
                    count_prob_increase += 1
                    prob_increase_samples['revised'].append(
                        self.dl.convert2unicode(sentence1))
                    prob_increase_samples['revised score'].append(
                        log_prob_revised_sentence1)
                    prob_increase_samples['original score'].append(
                        log_prob_sentence1)

                if log_prob_revised_sentence2 <= log_prob_sentence2:
                    count_prob_decrease += 1
                else:
                    count_prob_increase += 1
                    prob_increase_samples['revised'].append(
                        self.dl.convert2unicode(sentence2))
                    prob_increase_samples['revised score'].append(
                        log_prob_revised_sentence2)
                    prob_increase_samples['original score'].append(
                        log_prob_sentence2)

                # recover the original sentence
                sentence1[word_idx], sentence2[word_idx] = sentence2[
                    word_idx], sentence1[word_idx]
                if log_prob_revised_sentence1 > log_prob_sentence1:
                    prob_increase_samples['original'].append(
                        self.dl.convert2unicode(sentence1))
                if log_prob_revised_sentence2 > log_prob_sentence2:
                    prob_increase_samples['original'].append(
                        self.dl.convert2unicode(sentence2))

            if idx % 10 == 0:
                print("increase: ", count_prob_decrease)
                print("decrease: ", count_prob_increase)
            if idx > 100:
                break
        print("Probability decrease: ", count_prob_decrease)
        print("Probability increase: ", count_prob_increase)
        pd.DataFrame.from_dict(prob_increase_samples).to_csv(
            "../showable/ELMo_sanity_check.csv", index=False)
Example #30
print(fixer.candidates("incoerrct"))  # Output: incorrect

# Fix a string, results print on console
fixer.fix("Thsi is a sentnce taht full of mistakes")

# Initialize new fixer
# rt1.txt and rt2.txt are files that contain some random texts for testing
# Some words are spelled incorrectly in order to test the fixer
f = fixer.Fixer(
    dir,  # The root dir
    recursive=True,  # Recursively check all the files
    fname=["rt1", "rt2"])  # Only certain files will be checked
# Start fixing files, results print on console
f.fix()

import util
# print all files under resources folder
print(util.find(dir))
# print files with given filenames
print(util.find(dir, fname=["rt1"]))
# print files with given extensions
print(util.find(dir, suffix=[".txt"]))

print(util.trim("exam?ple!"))  # output: example
print(util.words("Separate words from sentence"))
# Output: ['separate', 'words', 'from', 'sentence']

# Find all the comments in a py file, here use corrector.py as an example
examplepy = util.find(dir, fname=["corrector"])[0]
print(util.find_comment_py(examplepy))
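Judging only from the output shown above, this util.trim strips punctuation rather than whitespace. A rough, assumed equivalent (not the library's actual implementation) might be:

import re

def trim(text):
    # Assumed behaviour: drop any character that is not a letter, digit, or whitespace.
    return re.sub(r'[^A-Za-z0-9\s]', '', text)

print(trim("exam?ple!"))  # example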