Example 1
    def query(self, ltoks, mingram, maxngram, nbest, sortByEDist, idx_tst):
        if idx_tst >= 0: #single sentence
            print("[{}]\t{}".format(idx_tst,' '.join(ltoks[0])))

        query_idx = []
        for toks in ltoks:
            query_idx.append(self.convert(toks))

        subphrases = self.getSubPhrase(query_idx, mingram, maxngram)
        counts = defaultdict(int)
        for subphrase in subphrases:
            result = self.getSentenceIds(subphrase)
            for idx_trn in result:
                counts[idx_trn] += 1

        if not sortByEDist: ### sort by counts
            sorted_counts = sorted(counts.items(), key = lambda x:x[1], reverse=True)
            for idx_trn, ngrams_count in sorted_counts[:nbest]:
                entry = []
                entry.append("{}".format(ngrams_count)) ### ngrams_counts
                trn_vec_idx = self.corpus[self.sentences[idx_trn]:self.sentences[idx_trn+1]-1]
                if idx_tst >= 0: #single sentence
                    sm = edit_distance.SequenceMatcher(a=query_idx[0], b=trn_vec_idx)
                    entry.append("{:.4f}".format(sm.ratio())) ### edit-distance ratio
                    entry.append("{}".format(idx_tst)) ### index tst
                entry.append("{}".format(idx_trn)) ### index trn
                entry.append(' '.join(self.convert(trn_vec_idx))) ### trn
                print('\t'.join(entry))

        else: ### sort by edit_distance
            if idx_tst == -1: 
                sys.stderr.write('error: -testSet cannot be used with -sortByEDist')
                sys.exit()

            edist = {} #defaultdict(float)
            mbest = nbest*10
            sorted_counts = sorted(counts.items(), key = lambda x:x[1], reverse=True)
            for idx_trn, ngrams_count in sorted_counts[:mbest]:
                trn_vec_idx = self.corpus[self.sentences[idx_trn]:self.sentences[idx_trn+1]-1]
                sm = edit_distance.SequenceMatcher(a=query_idx[0], b=trn_vec_idx)
                edist[idx_trn] = sm.ratio()

            sorted_edist = sorted(edist.items(), key = lambda x:x[1], reverse=True)
            for idx_trn, edist_ratio in sorted_edist[:nbest]:
                entry = []
                entry.append("{}".format(counts[idx_trn])) ### ngrams_counts
                trn_vec_idx = self.corpus[self.sentences[idx_trn]:self.sentences[idx_trn+1]-1]
                entry.append("{:.4f}".format(edist_ratio)) ### edit-distance ratio
                entry.append("{}".format(idx_tst)) ### index tst
                entry.append("{}".format(idx_trn)) ### index trn
                entry.append(' '.join(self.convert(trn_vec_idx))) ### trn
                print('\t'.join(entry))
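
A minimal standalone sketch of the ranking idea above (hypothetical data, outside the indexing class), assuming the edit_distance package: candidate sentences are scored against a query with SequenceMatcher.ratio() and the n best are kept.

import edit_distance

def rank_by_ratio(query_toks, candidate_sents, nbest=2):
    # score each candidate by edit-distance similarity to the query
    scored = []
    for idx, toks in enumerate(candidate_sents):
        sm = edit_distance.SequenceMatcher(a=query_toks, b=toks)
        scored.append((sm.ratio(), idx))
    scored.sort(reverse=True)  # highest similarity first
    return scored[:nbest]

print(rank_by_ratio("the cat sat".split(),
                    ["the cat sat down".split(), "a dog ran".split()]))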
Example 2
  def evaluate_all(self):
    num_samples = len(self.all_recognition_text)

    def _normalize_text(text):
      text = ''.join(filter(lambda x: x in (string.digits + string.ascii_letters), text))
      return text.lower()

    num_correct = 0
    num_incorrect = 0
    total_edit_distance = 0
    incorrect_pairs = []
    for i in range(num_samples):
      recognition = _normalize_text(self.all_recognition_text[i])
      groundtruth = _normalize_text(self.all_groundtruth_text[i])
      if recognition == groundtruth:
        num_correct += 1
      else:
        num_incorrect += 1
        incorrect_pairs.append((recognition, groundtruth))
      sm = edit_distance.SequenceMatcher(a=recognition, b=groundtruth)
      normalized_ed = sm.distance() / len(groundtruth)
      total_edit_distance += normalized_ed
    num_print = min(len(incorrect_pairs), 100)
    # print('*** Groundtruth => Prediction ***')
    # for i in range(num_print):
    #   recogition, groundtruth = incorrect_pairs[i]
    #   print('{} => {}'.format(groundtruth, recogition))
    # print('**********************************')
    case_insensitive_accuracy = 1.0 * num_correct / (num_correct + num_incorrect)

    metrics = {
      'WordAccuracy': case_insensitive_accuracy,
      'TotalEditDistance': total_edit_distance,
    }
    return metrics
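
A small self-contained check of the per-sample metric above, assuming the edit_distance package: the normalized edit distance is distance() divided by the ground-truth length.

import edit_distance

recognition, groundtruth = 'h3llo', 'hello'
sm = edit_distance.SequenceMatcher(a=recognition, b=groundtruth)
print(sm.distance() / len(groundtruth))  # 0.2: one substitution over five characters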
Example 3
 def target_sentence_sampling(s_cluster, candidates, edit_distance_matrix,
                              sent2idx_dict):
     longest = -1
     longest_p = []
     longest_p_nearest_p = []
     for point in candidates:
         idx_p = sent2idx_dict[' '.join(point)]
         min_dist = 999999
         min_p = []
         for s_point in s_cluster:
             idx_sp = sent2idx_dict[' '.join(s_point)]
             if edit_distance_matrix[idx_p][idx_sp] >= 0:
                 dist = edit_distance_matrix[idx_p][idx_sp]
             else:
                 sm = edit_distance.SequenceMatcher(a=point, b=s_point)
                 dist = sm.distance()
                 edit_distance_matrix[idx_p][idx_sp] = dist
                 edit_distance_matrix[idx_sp][idx_p] = dist
             if dist < min_dist:
                 min_dist = dist
                 min_p = s_point
         if min_dist > longest:
             longest = min_dist
             longest_p = point
             longest_p_nearest_p = min_p
     return longest_p, longest, longest_p_nearest_p
Example 4
    def update_centers(data_set, assignments, edit_distance_matrix,
                       sent2idx_dict):
        new_means = {}
        centers = []
        for assignment, point in zip(assignments, data_set):
            if assignment not in new_means:
                new_means[assignment] = [point]
            else:
                new_means[assignment].append(point)

        for center in new_means:
            points = new_means[center]
            shortest = 999999  # positive infinity
            shortest_p = []
            for i, point in enumerate(points):
                total_dist = 0
                for j, point2 in enumerate(points):
                    idx_p = sent2idx_dict[' '.join(point)]
                    idx_p2 = sent2idx_dict[' '.join(point2)]
                    if edit_distance_matrix[idx_p][idx_p2] >= 0:
                        dist = edit_distance_matrix[idx_p][idx_p2]
                    else:
                        sm = edit_distance.SequenceMatcher(a=point, b=point2)
                        dist = sm.distance()
                        edit_distance_matrix[idx_p][idx_p2] = dist
                        edit_distance_matrix[idx_p2][idx_p] = dist
                    total_dist += dist
                if total_dist < shortest:
                    shortest = total_dist
                    shortest_p = point
            centers.append(shortest_p)
        return centers
Example 5
def edrs(a, b):
    #Edit Distance with Real sequences
    a = a.tolist()
    b = b.tolist()

    sm = edit_distance.SequenceMatcher(a, b)
    return sm.distance()
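
A possible call of edrs above (hypothetical input), assuming numpy and the edit_distance import are in scope:

import numpy as np

print(edrs(np.array([1.0, 2.0, 3.0]), np.array([1.0, 3.0])))  # 1: one deletion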
Example 6
def get_similar_words(input_word, num_of_words):
    #print('get_similar_words')
    global vectors, ids

    p = np.array([nlp.vocab[input_word].vector])
    closest_index = distance.cdist(p, vectors)
    #print('closest_index',closest_index)
    output_list = []
    closest_indexes = closest_index.argsort()
    #print('closest_indexes',closest_indexes)
    closest_indexes = np.squeeze(closest_indexes)
    closest_indexes = closest_indexes[0:105]
    for i in closest_indexes:
        word_id = ids[i]
        output_word = nlp.vocab[word_id]
        output_word = output_word.text.lower()
        #print('in',type(input_word))
        #print('out',type(output_word))
        sm = edit_distance.SequenceMatcher(input_word.lower(),
                                           output_word.lower())
        levin_dist = sm.distance()
        if ((output_word.lower() != input_word.lower()) and (levin_dist > 2)):
            output_list.append(output_word)
            if len(output_list) >= num_of_words:
                return output_list
    return output_list
def get_top_songs(artist):
    top_songs = []

    #might want to have this number be dynamic based on attribute ie. set time/popularity?
    song_count = 5

    artist_search = spotify.search(q='artist:' + artist, type='artist')

    if len(artist_search['artists']['items']) == 0:
        return []

    result_name = artist_search['artists']['items'][0]['name']

    #comparing edit distance might be useful
    #SequenceMatcher.ratio() is in [0, 1], so compare against a fractional threshold
    if (result_name.lower() == artist.lower() or edit_distance.SequenceMatcher(
            artist.lower(), result_name.lower()).ratio() >= 0.66):

        artist_id = artist_search['artists']['items'][0]['id']
        top_tracks = spotify.artist_top_tracks(artist_id)['tracks']

        for track in top_tracks:
            top_songs.append(track['id'])

            if len(top_songs) >= song_count:
                break
    else:
        print("error could not find artist spotify name {} scraped name {}".
              format(result_name, artist))
    return top_songs
Example 8
def find_teachers(subject, target):
    with open(filename, 'r') as f:
        subjects_fo = json.load(f)
    teachers = list(subjects_fo[subject].keys())
    target = target.lower()
    teachers.sort(key=lambda t: edit_distance.SequenceMatcher(
        a=target, b=t.lower()).distance())
    return teachers
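
A minimal standalone sketch of the same sorting idea without the JSON file (hypothetical names), assuming the edit_distance package:

import edit_distance

names = ['Smith', 'Smyth', 'Jones']
names.sort(key=lambda n: edit_distance.SequenceMatcher(a='smith', b=n.lower()).distance())
print(names)  # ['Smith', 'Smyth', 'Jones']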
Example 9
 def similarity(self, predicted: List[int], targets: torch.LongTensor,
                target_mask: torch.LongTensor) -> float:
     # remove padding ones
     actual_len = target_mask.sum()
     targets_trimmed = targets[:actual_len]
     targets_trimmed = list(targets_trimmed.cpu().data.numpy())
     sm = edit_distance.SequenceMatcher(a=predicted, b=targets_trimmed)
     # get the edit distance similarity between two lists
     return sm.ratio()
Example 10
    def __call__(self, l1, l2):
        if (len(l1) == 1 and len(l1[0]) == 0) or (len(l2) == 1 and len(l2[0]) == 0):
            return 0.0, [''], ['']

        ### initially all discarded
        L1 = [self.u] * len(l1)
        L2 = [self.u] * len(l2)

        if self.lc: ### use .lower() or .casefold()
            sm = edit_distance.SequenceMatcher(a=[s.casefold() for s in l1], b=[s.casefold() for s in l2], action_function=edit_distance.highest_match_action)
        else:
            sm = edit_distance.SequenceMatcher(a=l1, b=l2, action_function=edit_distance.highest_match_action)

        for (code, b1, e1, b2, e2) in sm.get_opcodes():
            if code == 'equal': ### keep words
                L1[b1] = l1[b1]
                L2[b2] = l2[b2]
        return sm.ratio(), L1, L2
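
A hedged illustration of the opcode filtering above (hypothetical token lists), assuming the edit_distance package and its highest_match_action: only tokens aligned as 'equal' are kept.

import edit_distance

l1 = 'the quick brown fox'.split()
l2 = 'the slow brown dog'.split()
sm = edit_distance.SequenceMatcher(a=l1, b=l2,
                                   action_function=edit_distance.highest_match_action)
kept = [l1[b1] for (code, b1, e1, b2, e2) in sm.get_opcodes() if code == 'equal']
print(sm.ratio(), kept)  # 0.5 ['the', 'brown']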
Example 11
def get_sequence_matches(sequence_1, sequence_2):
    if sequence_1 and sequence_2:
        sm = edit_distance.SequenceMatcher(a=sequence_1, b=sequence_2)
        # the calls below only exercise the API; their results are not used
        sm.get_opcodes()
        sm.ratio()
        sm.get_matching_blocks()
        distance = sm.distance()
        num_matches = sm.matches()
        return num_matches
    else:
        return 0
Example 12
def calc_global_alignment(file_f,file_s):
    opcodes_f = getOpcodeForFile(file_f)
    opcodes_s = getOpcodeForFile(file_s)
    # alignments = pairwise2.align.globalms(opcodes_f,opcodes_s,2,-1,-0.5,-0.1, gap_char=["-"],one_alignment_only=True)
    #
    # for a in alignments:
    #     score = a[2]
    #     print (score)
    sm = edit_distance.SequenceMatcher(a=opcodes_f, b=opcodes_s)

    return (sm.ratio())
Example 13
 def cal_edit(self, data0, data1):
     edit_result = []
     for query in data0.keys():
         if query in data1.keys():
             list_0 = data0[query]
             list_1 = data1[query]
             sm = edit_distance.SequenceMatcher(list_0, list_1)
             edit_score = sm.distance()
             # print(edit_score)
             edit_result.append(edit_score)
     return edit_result
	def calcurate_edit_distance():
		ratio = 0.0
		ds = random.sample(test_data, 100)
		for fr,_,to in ds:
			result = model.translate([model.xp.array(fr)])[0]
			result = tree2normalizedsentense(result)
			to = tree2normalizedsentense(to)
			#print(to,result)
			ratio += 1.0 - edit_distance.SequenceMatcher(to,result).ratio()
		#print(ratio)
		ratio /= len(ds)
		return ratio
def get_neg_candidates_edit_distance(candidate_term, terms):
    closest_term_dist = 5
    closest_term = ""

    for term in terms:
        sequence_matcher = edit_distance.SequenceMatcher(a=candidate_term, b=term)
        edit_dist = sequence_matcher.distance()
        if edit_dist <= closest_term_dist and candidate_term != term:
            closest_term = term
            break  # accept the first term within the distance threshold

    return closest_term
Example 16
def parallel_decoding(data):
    posteriors, true_length, text, hmm = data
    posteriors = posteriors[:true_length]

    best_path, pstar = hmm.viterbi_decode(posteriors)
    word_seq = hmm.getTranscription(best_path)
    ref_seq = text.split(' ')

    # edit distance
    res = edit_distance.SequenceMatcher(a=ref_seq, b=word_seq)

    return word_seq, best_path, pstar, res.distance()
Example 17
def edit_dist_with_repl_similarity(tx_numb, rx_numb, word2numb):
    """ This function aligns two seq according to edit distance and then
     subtracts the similarity measure between replaced words from the edit distance.
     The Wu-Palmer similarity measure is used for this task.

    Args:
        tx_numb: the number representation of the tx sentence
        rx_numb: the number representation of the rx sentence
        word2numb: word to numb object

    Returns:
        dist_measur: returns the distance measure
    """
    # get the word representation
    tx_txt = word2numb.convert_n2w(tx_numb)
    rx_txt = word2numb.convert_n2w(rx_numb)

    ed_aligned = edit_distance.SequenceMatcher(a=tx_numb, b=rx_numb)

    dist_measur = ed_aligned.distance()  # this is the edit distance

    indx_tx = 0
    indx_rx = 0
    # go through insertions and deletions and replacements in the alignment
    for i, op in enumerate(ed_aligned.get_opcodes()):
        # print(op)
        if op[0] == 'equal':
            indx_tx += 1
            indx_rx += 1
            continue
        elif op[0] == 'replace':  # if replacement discount similarity
            tx_syn = get_synset(pos_tag([tx_txt[indx_tx]]))
            rx_syn = get_synset(pos_tag([rx_txt[indx_rx]]))
            sim = 0
            if (tx_syn is not None) and (rx_syn is not None):
                sim = tx_syn.wup_similarity(
                    rx_syn)  # use Wu Palmer similarity measure
                if sim is None:
                    sim = 0
            dist_measur -= sim
            indx_tx += 1
            indx_rx += 1
        elif op[0] == 'delete':
            indx_tx += 1
        elif op[0] == 'insert':
            indx_rx += 1
        else:
            print("****************** ERROR ***************")
            break

    return dist_measur
Example 18
def cal_distance(label_list, pre_list):
    y = ed.SequenceMatcher(a = label_list, b = pre_list)
    yy = y.get_opcodes()
    insert = 0
    delete = 0
    replace = 0
    for item in yy:
        if item[0] == 'insert':
            insert += item[-1] - item[-2]    # span length in pre_list
        if item[0] == 'delete':
            delete += item[2] - item[1]      # span length in label_list
        if item[0] == 'replace':
            replace += item[-1] - item[-2]
    distance = insert + delete + replace
    return distance, (delete, replace, insert)
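
A possible usage of cal_distance above (hypothetical inputs, assuming `import edit_distance as ed` is in scope): kitten -> sitting needs two replacements and one insertion.

label_list = list('kitten')
pre_list = list('sitting')
dist, (deletes, replaces, inserts) = cal_distance(label_list, pre_list)
print(dist, deletes, replaces, inserts)  # 3 0 2 1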
Example 19
def compute_pdists_in_docs(docs, length_ratio_threshold=None):
    pdists = []
    n = len(docs)
    for j in range(n - 1):
        for i in range(j + 1, n):
            len_i = len(docs[i])
            len_j = len(docs[j])
            len_ratio = min(len_i, len_j) / max(len_i, len_j)
            if (length_ratio_threshold is not None
                    and len_ratio < length_ratio_threshold):
                ratio = 1.0
            else:
                sm = edit_distance.SequenceMatcher(a=docs[j], b=docs[i])
                ratio = sm.distance() * 2 / (len_j + len_i)
            pdists.append(ratio)
    return pdists
Example 20
def validation(model, val_fn, decode_fn, datagen, mb_size=64):
    """ Validation routine for speech-models
    Params:
        model (keras.model): Constructed keras model
        val_fn (theano.function): A theano function that calculates the cost
            over a validation set
        decode_fn: Function that decodes the network output into text
        datagen (DataGenerator)
        mb_size (int): Size of each minibatch
    Returns:
        val_cost (float): Average validation cost over the whole validation set
        val_acc (float): Average accuracy (1 - normalized edit distance)
    """
    avg_cost = 0.0
    avg_acc = 0.0
    i = 0
    for batch in datagen.iterate_validation(mb_size):
        inputs = batch['x']
        labels = batch['y']
        input_lengths = batch['input_lengths']
        label_lengths = batch['label_lengths']
        texts = batch['texts']
        # print('labels:'+str(labels))
        # Due to convolution, the number of timesteps of the output
        # is different from the input length. Calculate the resulting
        # timesteps
        ctc_input_lens = ctc_input_length(model, input_lengths)
        # print('ctc_input_lens_pre:'+str(ctc_input_lens))
        prediction, ctc_cost = val_fn(
            [inputs, ctc_input_lens, labels, label_lengths, True])
        # print(labels)
        # prediction = np.swapaxes(prediction, 0, 1)
        predict_str = argmax_decode(prediction, decode_fn, ctc_input_lens)

        # print('predict_str:'+str(predict_str))
        avg_cost += ctc_cost.mean()
        print('predict_str:' + str(predict_str))
        print('texts:' + str(texts))
        acc_sum = 0
        for index, text in enumerate(texts):
            sm = edit_distance.SequenceMatcher(a=text, b=predict_str[index])
            acc = 1.0 - sm.distance() / len(text)
            acc_sum = acc_sum + acc
        avg_acc += acc_sum * 1.0 / (index + 1)
        i += 1
    if i == 0:
        return 0.0, 0.0
    return avg_cost / i, avg_acc / i
Example 21
def correctErrors(newStr, dic):
    for i in range(0, len(newStr), 4):
        str1 = newStr[i:i + 4]
        error = 1
        for j in range(8):
            if str1 == dic[str(j)]:
                error = 0
                break
        if error == 1:
            edit_dist = []
            for k in range(8):
                edit_dist.append(
                    edit_distance.SequenceMatcher(a=str1,
                                                  b=dic[str(k)]).distance())
            min_index = edit_dist.index(min(edit_dist))
            newStr = newStr[:i] + dic[str(min_index)] + newStr[i + 4:]
    return newStr
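
A possible call of correctErrors above (hypothetical 8-word codebook, assuming the function and the edit_distance import are in scope): the first 4-character chunk is not a codeword, so it is snapped to the nearest one by edit distance.

dic = {str(k): cw for k, cw in enumerate(
    ['ATTC', 'ACTA', 'ATTA', 'TATA', 'AATC', 'ACAA', 'TTTC', 'TCTA'])}
print(correctErrors('ATTGTCTA', dic))  # 'ATTCTCTA'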
Example 22
def test(model, test_fn, decode_fn, datagen, mb_size=16, conv_context=11,
         conv_border_mode='valid', conv_stride=2):
    """ Testing routine for speech-models
    Params:
        model (keras.model): Constructed keras model
        test_fn (theano.function): A theano function that calculates the cost
            over a test set
        decode_fn: Function that decodes the network output into text
        datagen (DataGenerator)
        mb_size (int): Size of each minibatch
        conv_context (int): Convolution context
        conv_border_mode (str): Convolution border mode
        conv_stride (int): Convolution stride
    Returns:
        An empty string; per-sample accuracies and their mean are printed instead
    """
    avg_cost = 0.0
    i = 0
    acc_list = []
    for batch in datagen.iterate_test(mb_size):
        inputs = batch['x']
        labels = batch['y']
        input_lengths = batch['input_lengths']
        label_lengths = batch['label_lengths']
        ground_truth = batch['texts']
        # Due to convolution, the number of timesteps of the output
        # is different from the input length. Calculate the resulting
        # timesteps
        # output_lengths = [conv_output_length(l, conv_context,
        #                                      conv_border_mode, conv_stride)
        #                   for l in input_lengths]
        ctc_input_lens = ctc_input_length(model, input_lengths)
        prediction, ctc_cost = test_fn([inputs, ctc_input_lens, labels,
                                        label_lengths, True])
        # predictions = np.swapaxes(predictions, 0, 1)
        prediction_str = argmax_decode(prediction, decode_fn, ctc_input_lens)
        for i, prediction in enumerate(prediction_str):
            truth = ground_truth[i]
            sm = edit_distance.SequenceMatcher(a=truth,b=prediction)
            acc = 1 - sm.distance()/len(truth) 
            acc_list.append(acc)
            print("Truth: {}, Prediction: {}, acc: {}".format(truth, prediction, acc))
    print(acc_list) 
    print('avg_acc:'+str(np.array(acc_list).mean()))
    return ''
Example 23
    def EDAlignment(self):
        ### this vector points to the corresponding src word for each tst word
        ### (or -1 if there is no correspondence)
        self.tst2src = [-1] * len(self.tst)
        self.src2tst = [-1] * len(self.src)
        sm = edit_distance.SequenceMatcher(self.tst, self.src)
        blocks = sm.get_matching_blocks()
        for block in blocks:
            self.tst2src[block[0]] = block[1]
            self.src2tst[block[1]] = block[0]

        if self.verbose:
            for x in range(len(self.tst2src)):
                if self.tst2src[x] != -1:
                    print('TST2SRC [{}:{} {}:{}]'.format(
                        x, self.tst[x], self.tst2src[x],
                        self.src[self.tst2src[x]]))

        return
Example 24
 def assign_points(data_points, centers, edit_distance_matrix,
                   sent2idx_dict):
     assignments = []
     for point in data_points:
         shortest = 999999  # positive infinity
         shortest_index = 0
         for i, center in enumerate(centers):
             idx_p = sent2idx_dict[' '.join(point)]
             idx_c = sent2idx_dict[' '.join(center)]
             if edit_distance_matrix[idx_p][idx_c] >= 0:
                 dist = edit_distance_matrix[idx_p][idx_c]
             else:
                 sm = edit_distance.SequenceMatcher(a=point, b=center)
                 dist = sm.distance()
                 edit_distance_matrix[idx_p][idx_c] = dist
                 edit_distance_matrix[idx_c][idx_p] = dist
             if dist < shortest:
                 shortest = dist
                 shortest_index = i
         assignments.append(shortest_index)
     return assignments
Example 25
    def test(self, dataset):
        # init stats
        E, N = 0, 0
        with tqdm(total=len(dataset),
                  bar_format='    {l_bar}{bar:30}{r_bar}') as pbar:
            for x, y, text, y_true_length in dataset.generator():
                posteriors = self.features_to_posteriors(x)
                # run viterbi to get recognized words
                best_path, pstar = self.hmm.viterbi_decode(posteriors)
                word_seq = self.hmm.getTranscription(best_path)
                # get original text
                ref_seq = text.split(' ')

                # edit distance
                res = edit_distance.SequenceMatcher(a=ref_seq, b=word_seq)
                E += res.distance()
                N += len(ref_seq)
                accuracy = (N - E) / N
                # update progress bar
                pbar.set_description(f'Test acc. {accuracy:.6f}')
                pbar.update(1)
        return accuracy
Example 26
def _get_operation_counts(
    source_string: str, destination_string: str
) -> Tuple[int, int, int, int]:
    """
    Check how many edit operations (delete, insert, replace) are required to
    transform the source string into the destination string. The number of hits
    can be given by subtracting the number of deletes and substitutions from the
    total length of the source string.

    :param source_string: the source string to transform into the destination string
    :param destination_string: the destination to transform the source string into
    :return: a tuple of #hits, #substitutions, #deletions, #insertions
    """

    #editops = Levenshtein.editops(source_string, destination_string)
    editops = edit_distance.SequenceMatcher(a=source_string, b=destination_string).get_opcodes()

    substitutions = sum(1 if op[0] == "replace" else 0 for op in editops)
    deletions = sum(1 if op[0] == "delete" else 0 for op in editops)
    insertions = sum(1 if op[0] == "insert" else 0 for op in editops)
    hits = len(source_string) - (substitutions + deletions)

    return hits, substitutions, deletions, insertions
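
A hedged follow-up showing how these counts are typically combined into an error rate (a standard WER/CER-style formula, not part of the original snippet):

hits, substitutions, deletions, insertions = _get_operation_counts('kitten', 'sitting')
error_rate = (substitutions + deletions + insertions) / len('kitten')
print(hits, error_rate)  # 4 hits, 0.5 (3 edits over 6 source characters)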
        adv_digit = attack_dir.parent.parent.parent.name[-1]
        target_filename = attack_dir.parent.parent.parent.parent.name
        target_speaker = '-'.join(target_filename.split("-")[:-1])
        original_digit = target_filename[-2]

        E, N = 0, 0
        for test_filename, r in attack_res['test_res'].items():
            pred_word_seq, label_word_seq = r['pred_word_seq'], r['label_word_seq']
            label_word_seq = ' '.join([str(d) for d in tools.str_to_digits(label_word_seq.split())])
            pred_word_seq = ' '.join([str(d) for d in tools.str_to_digits(pred_word_seq.split())])

            if test_filename.startswith(target_speaker):
                continue
            else:
                res = edit_distance.SequenceMatcher(a=label_word_seq, b=pred_word_seq)
                E += res.distance()
                N += len(label_word_seq.split(" "))

        speaker_E, speaker_N = 0, 0
        speaker_target_file_num = 0
        speaker_succeeded_targets = []
        for test_filename, r in attack_res['speaker_res'].items():
            pred_word_seq, label_word_seq = r['pred_word_seq'], r['label_word_seq']
            label_word_seq = ' '.join([str(d) for d in tools.str_to_digits(label_word_seq.split())])
            pred_word_seq = ' '.join([str(d) for d in tools.str_to_digits(pred_word_seq.split())])

            if test_filename == target_filename:
                continue

            if test_filename.startswith(target_speaker):
Example 28
output = []
net_accuracy = []
edit_distance_output = []
for j in range(5):
    net_score = 0
    total_words = 0
    edit_distance_score = 0
    accuracy = 0.0
    for i in range(int(floor(j * len(a)) / 5.0),
                   int(floor((j + 1) * len(a)) / 5.0)):
        reference = [a[i]]
        candidate = s[i]
        score = sentence_bleu(reference, candidate)
        net_score += score
        total_words += 1
        sm = edit_distance.SequenceMatcher(a=a[i], b=s[i])
        edit_distance_score += sm.ratio()
        if a[i] == s[i][:-1]:
            accuracy += 1
    net_accuracy.append(accuracy / total_words)
    edit_distance_output.append(edit_distance_score / total_words)
    output.append(net_score / total_words)

print(output)
plt.plot([1, 2, 3, 4, 5], output, 'ro')
plt.axis([0, 6, 0.7, 0.8])
plt.xlabel('Test Set')
plt.ylabel('Bleu Score')
plt.show()

print(net_accuracy)
Example 29
def editDistance(s1, s2):
    return edit_distance.SequenceMatcher(a=s1, b=s2).distance()
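
A quick usage check of the wrapper above, assuming the edit_distance package; it works on strings and on arbitrary sequences alike.

print(editDistance('kitten', 'sitting'))         # 3
print(editDistance(['a', 'b', 'c'], ['a', 'c']))  # 1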
Example 30
def codeBookGen():

	#dic = {"1":"ATTC","2":"ACTA","3":"ATTA","4":"TATA","5":"AATC","6":"ACAA","7":"TTTC","0":"TCTA",}

	#generate all possible sequences of length 4
	pool = []

	print("Making dictionary")

	for i in range(256):
		temp = format(i,'08b')	
		str2 = ""
		x = ""
		
		for j in range(4):
			x = temp[2*j:2*j+2]
			#print(x)
			if x == '00':
				str2 += 'A'
			elif x == '01':
				str2 += 'C'
			elif x == '10':
				str2 += 'G'
			else :
				str2 += 'T'


			#print(temp)	
		#print(temp)
		#print(str2)
		pool.append(str2);
		
	#pool has been created: 256 length-4 sequences
	#now test for repetitiveness

	pool1 = []
	for str2 in pool:
		myset = set()
		
		#print(str2)
		
		for i in range(16):
			ss = ""		#subsequence
			b = format(i,'04b')
			for j in range(4):
				if b[j] == '1':
					ss += str2[j]
				
			#print(ss)
			myset.add(ss)

		#print(len(myset))
		
		ratio = len(myset)/16	

		if ratio > 0.75:
			pool1.append(str2)
			#print(myset)
			#print(str2)
		#time.sleep(10)		
		
	#print(len(pool1))
		
	#The sequences with high repetitiveness have been removed.
	#Now we will remove ones with undesirable GC content (<40% or >60%)
	"""
	pool2 = []

	for str2 in pool1:
		countGC = str2.count('G') + str2.count('C')
		if countGC == 2:
			pool2.append(str2)
			print(str2)
		
	print(len(pool2))	
	"""
	pool2 = pool1				#REMOVE THIS IF GC CONTENT CONSTRAINT NEEDS TO BE INCLUDED

	no = 0

	while no != 8:
		
	#now we use edit distance constraint
		x = random.randint(0,len(pool2)-1)
		codewords = []
		codewords.append(pool2[x])

		str1 = codewords[0]


		pool3 = []
		temppool = pool2

		for i in range(len(pool2)):
			str1 = codewords[i]
			for str2 in temppool:
				dist = edit_distance.SequenceMatcher(a=str2,b=str1).distance()
				if  dist >= 3:
					pool3.append(str2)	
				"""
				else:
					print(dist)
					print(str2,str1)
				"""	
			#print('This is the length of pool 3:',len(pool3))
			if len(pool3) == 0:
				break
					
			x = random.randint(0,len(pool3)-1)
			codewords.append(pool3[x])
			temppool = pool3
			pool3 = []
				
		no = len(codewords)
	#	print(codewords)	
	return codewords