Example 1
    def query(self, ltoks, mingram, maxngram, nbest, sortByEDist, idx_tst):
        if idx_tst >= 0: #single sentence
            print("[{}]\t{}".format(idx_tst,' '.join(ltoks[0])))

        query_idx = []
        for toks in ltoks:
            query_idx.append(self.convert(toks))

        subphrases = self.getSubPhrase(query_idx, mingram, maxngram)
        counts = defaultdict(int)
        for subphrase in subphrases:
            result = self.getSentenceIds(subphrase)
            for idx_trn in result:
                counts[idx_trn] += 1

        if not sortByEDist: ### sort by counts
            sorted_counts = sorted(counts.items(), key = lambda x:x[1], reverse=True)
            for idx_trn, ngrams_count in sorted_counts[:nbest]:
                entry = []
                entry.append("{}".format(ngrams_count)) ### ngrams_counts
                trn_vec_idx = self.corpus[self.sentences[idx_trn]:self.sentences[idx_trn+1]-1]
                if idx_tst >= 0: #single sentence
                    sm = edit_distance.SequenceMatcher(a=query_idx[0], b=trn_vec_idx)
                    entry.append("{:.4f}".format(sm.ratio())) ### edit-distance ratio
                    entry.append("{}".format(idx_tst)) ### index tst
                entry.append("{}".format(idx_trn)) ### index trn
                entry.append(' '.join(self.convert(trn_vec_idx))) ### trn
                print('\t'.join(entry))

        else: ### sort by edit_distance
            if idx_tst == -1: 
                sys.stderr.write('error: -testSet cannot be used with -sortByEDist')
                sys.exit()

            edist = {} #defaultdict(float)
            mbest = nbest*10
            sorted_counts = sorted(counts.items(), key = lambda x:x[1], reverse=True)
            for idx_trn, ngrams_count in sorted_counts[:mbest]:
                trn_vec_idx = self.corpus[self.sentences[idx_trn]:self.sentences[idx_trn+1]-1]
                sm = edit_distance.SequenceMatcher(a=query_idx[0], b=trn_vec_idx)
                edist[idx_trn] = sm.ratio()

            sorted_edist = sorted(edist.items(), key = lambda x:x[1], reverse=True)
            for idx_trn, edist_ratio in sorted_edist[:nbest]:
                entry = []
                entry.append("{}".format(counts[idx_trn])) ### ngrams_counts
                trn_vec_idx = self.corpus[self.sentences[idx_trn]:self.sentences[idx_trn+1]-1]
                entry.append("{:.4f}".format(edist_ratio)) ### edit-distance ratio
                entry.append("{}".format(idx_tst)) ### index tst
                entry.append("{}".format(idx_trn)) ### index trn
                entry.append(' '.join(self.convert(trn_vec_idx))) ### trn
                print('\t'.join(entry))
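
A minimal standalone sketch of the ranking idea above (hypothetical data, outside the indexing class), assuming the edit_distance package: candidate sentences are scored against a query with SequenceMatcher.ratio() and the n best are kept.

import edit_distance

def rank_by_ratio(query_toks, candidate_sents, nbest=2):
    # score each candidate by edit-distance similarity to the query
    scored = []
    for idx, toks in enumerate(candidate_sents):
        sm = edit_distance.SequenceMatcher(a=query_toks, b=toks)
        scored.append((sm.ratio(), idx))
    scored.sort(reverse=True)  # highest similarity first
    return scored[:nbest]

print(rank_by_ratio("the cat sat".split(),
                    ["the cat sat down".split(), "a dog ran".split()]))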
Example 2
  def evaluate_all(self):
    num_samples = len(self.all_recognition_text)

    def _normalize_text(text):
      text = ''.join(filter(lambda x: x in (string.digits + string.ascii_letters), text))
      return text.lower()

    num_correct = 0
    num_incorrect = 0
    total_edit_distance = 0
    incorrect_pairs = []
    for i in range(num_samples):
      recognition = _normalize_text(self.all_recognition_text[i])
      groundtruth = _normalize_text(self.all_groundtruth_text[i])
      if recognition == groundtruth:
        num_correct += 1
      else:
        num_incorrect += 1
        incorrect_pairs.append((recognition, groundtruth))
      sm = edit_distance.SequenceMatcher(a=recognition, b=groundtruth)
      normalized_ed = sm.distance() / len(groundtruth)
      total_edit_distance += normalized_ed
    num_print = min(len(incorrect_pairs), 100)
    # print('*** Groundtruth => Prediction ***')
    # for i in range(num_print):
    #   recogition, groundtruth = incorrect_pairs[i]
    #   print('{} => {}'.format(groundtruth, recogition))
    # print('**********************************')
    case_insensitive_accuracy = 1.0 * num_correct / (num_correct + num_incorrect)

    metrics = {
      'WordAccuracy': case_insensitive_accuracy,
      'TotalEditDistance': total_edit_distance,
    }
    return metrics
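
A small self-contained check of the per-sample metric above, assuming the edit_distance package: the normalized edit distance is distance() divided by the ground-truth length.

import edit_distance

recognition, groundtruth = 'h3llo', 'hello'
sm = edit_distance.SequenceMatcher(a=recognition, b=groundtruth)
print(sm.distance() / len(groundtruth))  # 0.2: one substitution over five characters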
Example 3
 def target_sentence_sampling(s_cluster, candidates, edit_distance_matrix,
                              sent2idx_dict):
     longest = -1
     longest_p = []
     longest_p_nearest_p = []
     for point in candidates:
         idx_p = sent2idx_dict[' '.join(point)]
         min_dist = 999999
         min_p = []
         for s_point in s_cluster:
             idx_sp = sent2idx_dict[' '.join(s_point)]
             if edit_distance_matrix[idx_p][idx_sp] >= 0:
                 dist = edit_distance_matrix[idx_p][idx_sp]
             else:
                 sm = edit_distance.SequenceMatcher(a=point, b=s_point)
                 dist = sm.distance()
                 edit_distance_matrix[idx_p][idx_sp] = dist
                 edit_distance_matrix[idx_sp][idx_p] = dist
             if dist < min_dist:
                 min_dist = dist
                 min_p = s_point
         if min_dist > longest:
             longest = min_dist
             longest_p = point
             longest_p_nearest_p = min_p
     return longest_p, longest, longest_p_nearest_p
Example 4
    def update_centers(data_set, assignments, edit_distance_matrix,
                       sent2idx_dict):
        new_means = {}
        centers = []
        for assignment, point in zip(assignments, data_set):
            if assignment not in new_means:
                new_means[assignment] = [point]
            else:
                new_means[assignment].append(point)

        for center in new_means:
            points = new_means[center]
            shortest = 999999  # positive infinity
            shortest_p = []
            for i, point in enumerate(points):
                total_dist = 0
                for j, point2 in enumerate(points):
                    idx_p = sent2idx_dict[' '.join(point)]
                    idx_p2 = sent2idx_dict[' '.join(point2)]
                    if edit_distance_matrix[idx_p][idx_p2] >= 0:
                        dist = edit_distance_matrix[idx_p][idx_p2]
                    else:
                        sm = edit_distance.SequenceMatcher(a=point, b=point2)
                        dist = sm.distance()
                        edit_distance_matrix[idx_p][idx_p2] = dist
                        edit_distance_matrix[idx_p2][idx_p] = dist
                    total_dist += dist
                if total_dist < shortest:
                    shortest = total_dist
                    shortest_p = point
            centers.append(shortest_p)
        return centers
Example 5
def edrs(a, b):
    #Edit Distance with Real sequences
    a = a.tolist()
    b = b.tolist()

    sm = edit_distance.SequenceMatcher(a, b)
    return sm.distance()
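
A possible call of edrs above (hypothetical input), assuming numpy and the edit_distance import are in scope:

import numpy as np

print(edrs(np.array([1.0, 2.0, 3.0]), np.array([1.0, 3.0])))  # 1: one deletion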
Example 6
def get_similar_words(input_word, num_of_words):
    #print('get_similar_words')
    global vectors, ids

    p = np.array([nlp.vocab[input_word].vector])
    closest_index = distance.cdist(p, vectors)
    #print('closest_index',closest_index)
    output_list = []
    closest_indexes = closest_index.argsort()
    #print('closest_indexes',closest_indexes)
    closest_indexes = np.squeeze(closest_indexes)
    closest_indexes = closest_indexes[0:105]
    for i in closest_indexes:
        word_id = ids[i]
        output_word = nlp.vocab[word_id]
        output_word = output_word.text.lower()
        #print('in',type(input_word))
        #print('out',type(output_word))
        sm = edit_distance.SequenceMatcher(input_word.lower(),
                                           output_word.lower())
        levin_dist = sm.distance()
        if ((output_word.lower() != input_word.lower()) and (levin_dist > 2)):
            output_list.append(output_word)
            if len(output_list) >= num_of_words:
                return output_list
    return output_list
def get_top_songs(artist):
    top_songs = []

    #might want to have this number be dynamic based on attribute ie. set time/popularity?
    song_count = 5

    artist_search = spotify.search(q='artist:' + artist, type='artist')

    if len(artist_search['artists']['items']) == 0:
        return []

    result_name = artist_search['artists']['items'][0]['name']

    #comparing edit distance might be useful
    #SequenceMatcher.ratio() is in [0, 1], so compare against a fractional threshold
    if (result_name.lower() == artist.lower() or edit_distance.SequenceMatcher(
            artist.lower(), result_name.lower()).ratio() >= 0.66):

        artist_id = artist_search['artists']['items'][0]['id']
        top_tracks = spotify.artist_top_tracks(artist_id)['tracks']

        for track in top_tracks:
            top_songs.append(track['id'])

            if len(top_songs) >= song_count:
                break
    else:
        print("error could not find artist spotify name {} scraped name {}".
              format(result_name, artist))
    return top_songs
Example 8
def find_teachers(subject, target):
    with open(filename, 'r') as f:
        subjects_fo = json.load(f)
    teachers = list(subjects_fo[subject].keys())
    target = target.lower()
    teachers.sort(key=lambda t: edit_distance.SequenceMatcher(
        a=target, b=t.lower()).distance())
    return teachers
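
A minimal standalone sketch of the same sorting idea without the JSON file (hypothetical names), assuming the edit_distance package:

import edit_distance

names = ['Smith', 'Smyth', 'Jones']
names.sort(key=lambda n: edit_distance.SequenceMatcher(a='smith', b=n.lower()).distance())
print(names)  # ['Smith', 'Smyth', 'Jones']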
Example 9
 def similarity(self, predicted: List[int], targets: torch.LongTensor,
                target_mask: torch.LongTensor) -> float:
     # remove padding ones
     actual_len = target_mask.sum()
     targets_trimmed = targets[:actual_len]
     targets_trimmed = list(targets_trimmed.cpu().data.numpy())
     sm = edit_distance.SequenceMatcher(a=predicted, b=targets_trimmed)
     # get the edit distance similarity between two lists
     return sm.ratio()
Example 10
    def __call__(self, l1, l2):
        if (len(l1) == 1 and len(l1[0]) == 0) or (len(l2) == 1 and len(l2[0]) == 0):
            return 0.0, [''], ['']

        ### initially all discarded
        L1 = [self.u] * len(l1)
        L2 = [self.u] * len(l2)

        if self.lc: ### use .lower() or .casefold()
            sm = edit_distance.SequenceMatcher(a=[s.casefold() for s in l1], b=[s.casefold() for s in l2], action_function=edit_distance.highest_match_action)
        else:
            sm = edit_distance.SequenceMatcher(a=l1, b=l2, action_function=edit_distance.highest_match_action)

        for (code, b1, e1, b2, e2) in sm.get_opcodes():
            if code == 'equal': ### keep words
                L1[b1] = l1[b1]
                L2[b2] = l2[b2]
        return sm.ratio(), L1, L2
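
A hedged illustration of the opcode filtering above (hypothetical token lists), assuming the edit_distance package and its highest_match_action: only tokens aligned as 'equal' are kept.

import edit_distance

l1 = 'the quick brown fox'.split()
l2 = 'the slow brown dog'.split()
sm = edit_distance.SequenceMatcher(a=l1, b=l2,
                                   action_function=edit_distance.highest_match_action)
kept = [l1[b1] for (code, b1, e1, b2, e2) in sm.get_opcodes() if code == 'equal']
print(sm.ratio(), kept)  # 0.5 ['the', 'brown']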
Example 11
def get_sequence_matches(sequence_1, sequence_2):
    if sequence_1 and sequence_2:
        sm = edit_distance.SequenceMatcher(a=sequence_1, b=sequence_2)
        # the calls below only exercise the API; their results are not used
        sm.get_opcodes()
        sm.ratio()
        sm.get_matching_blocks()
        distance = sm.distance()
        num_matches = sm.matches()
        return num_matches
    else:
        return 0
Example 12
def calc_global_alignment(file_f,file_s):
    opcodes_f = getOpcodeForFile(file_f)
    opcodes_s = getOpcodeForFile(file_s)
    # alignments = pairwise2.align.globalms(opcodes_f,opcodes_s,2,-1,-0.5,-0.1, gap_char=["-"],one_alignment_only=True)
    #
    # for a in alignments:
    #     score = a[2]
    #     print (score)
    sm = edit_distance.SequenceMatcher(a=opcodes_f, b=opcodes_s)

    return (sm.ratio())
Example 13
 def cal_edit(self, data0, data1):
     edit_result = []
     for query in data0.keys():
         if query in data1.keys():
             list_0 = data0[query]
             list_1 = data1[query]
             sm = edit_distance.SequenceMatcher(list_0, list_1)
             edit_score = sm.distance()
             # print(edit_score)
             edit_result.append(edit_score)
     return edit_result
	def calcurate_edit_distance():
		ratio = 0.0
		ds = random.sample(test_data, 100)
		for fr,_,to in ds:
			result = model.translate([model.xp.array(fr)])[0]
			result = tree2normalizedsentense(result)
			to = tree2normalizedsentense(to)
			#print(to,result)
			ratio += 1.0 - edit_distance.SequenceMatcher(to,result).ratio()
		#print(ratio)
		ratio /= len(ds)
		return ratio
def get_neg_candidates_edit_distance(candidate_term, terms):
    closest_term_dist = 5
    closest_term = ""

    for term in terms:
        sequence_matcher = edit_distance.SequenceMatcher(a=candidate_term, b=term)
        edit_dist = sequence_matcher.distance()
        if edit_dist <= closest_term_dist and candidate_term != term:
            closest_term = term
            break  # accept the first term within the distance threshold

    return closest_term
Example 16
def parallel_decoding(data):
    posteriors, true_length, text, hmm = data
    posteriors = posteriors[:true_length]

    best_path, pstar = hmm.viterbi_decode(posteriors)
    word_seq = hmm.getTranscription(best_path)
    ref_seq = text.split(' ')

    # edit distance
    res = edit_distance.SequenceMatcher(a=ref_seq, b=word_seq)

    return word_seq, best_path, pstar, res.distance()
Example 17
def edit_dist_with_repl_similarity(tx_numb, rx_numb, word2numb):
    """ This function aligns two seq according to edit distance and then
     subtracts the similarity measure between replaced words from the edit distance.
     The Wu-Palmer similarity measure is used for this task.

    Args:
        tx_numb: the number representation of the tx sentence
        rx_numb: the number representation of the rx sentence
        word2numb: word to numb object

    Returns:
        dist_measur: returns the distance measure
    """
    # get the word representation
    tx_txt = word2numb.convert_n2w(tx_numb)
    rx_txt = word2numb.convert_n2w(rx_numb)

    ed_aligned = edit_distance.SequenceMatcher(a=tx_numb, b=rx_numb)

    dist_measur = ed_aligned.distance()  # this is the edit distance

    indx_tx = 0
    indx_rx = 0
    # go through insertions and deletions and replacements in the alignment
    for i, op in enumerate(ed_aligned.get_opcodes()):
        # print(op)
        if op[0] == 'equal':
            indx_tx += 1
            indx_rx += 1
            continue
        elif op[0] == 'replace':  # if replacement discount similarity
            tx_syn = get_synset(pos_tag([tx_txt[indx_tx]]))
            rx_syn = get_synset(pos_tag([rx_txt[indx_rx]]))
            sim = 0
            if (tx_syn is not None) and (rx_syn is not None):
                sim = tx_syn.wup_similarity(
                    rx_syn)  # use Wu Palmer similarity measure
                if sim is None:
                    sim = 0
            dist_measur -= sim
            indx_tx += 1
            indx_rx += 1
        elif op[0] == 'delete':
            indx_tx += 1
        elif op[0] == 'insert':
            indx_rx += 1
        else:
            print("****************** ERROR ***************")
            break

    return dist_measur
Example 18
def cal_distance(label_list, pre_list):
    y = ed.SequenceMatcher(a = label_list, b = pre_list)
    yy = y.get_opcodes()
    insert = 0
    delete = 0
    replace = 0
    for item in yy:
        if item[0] == 'insert':
            insert += item[-1] - item[-2]    # span length in pre_list
        if item[0] == 'delete':
            delete += item[2] - item[1]      # span length in label_list
        if item[0] == 'replace':
            replace += item[-1] - item[-2]
    distance = insert + delete + replace
    return distance, (delete, replace, insert)
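
A possible usage of cal_distance above (hypothetical inputs, assuming `import edit_distance as ed` is in scope): kitten -> sitting needs two replacements and one insertion.

label_list = list('kitten')
pre_list = list('sitting')
dist, (deletes, replaces, inserts) = cal_distance(label_list, pre_list)
print(dist, deletes, replaces, inserts)  # 3 0 2 1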
Example 19
def compute_pdists_in_docs(docs, length_ratio_threshold=None):
    pdists = []
    n = len(docs)
    for j in range(n - 1):
        for i in range(j + 1, n):
            len_i = len(docs[i])
            len_j = len(docs[j])
            len_ratio = min(len_i, len_j) / max(len_i, len_j)
            if (length_ratio_threshold is not None
                    and len_ratio < length_ratio_threshold):
                ratio = 1.0
            else:
                sm = edit_distance.SequenceMatcher(a=docs[j], b=docs[i])
                ratio = sm.distance() * 2 / (len_j + len_i)
            pdists.append(ratio)
    return pdists
Example 20
def validation(model, val_fn, decode_fn, datagen, mb_size=64):
    """ Validation routine for speech-models
    Params:
        model (keras.model): Constructed keras model
        val_fn (theano.function): A theano function that calculates the cost
            over a validation set
        decode_fn: Function that decodes the network output into text
        datagen (DataGenerator)
        mb_size (int): Size of each minibatch
    Returns:
        val_cost (float): Average validation cost over the whole validation set
        val_acc (float): Average accuracy (1 - normalized edit distance)
    """
    avg_cost = 0.0
    avg_acc = 0.0
    i = 0
    for batch in datagen.iterate_validation(mb_size):
        inputs = batch['x']
        labels = batch['y']
        input_lengths = batch['input_lengths']
        label_lengths = batch['label_lengths']
        texts = batch['texts']
        # print('labels:'+str(labels))
        # Due to convolution, the number of timesteps of the output
        # is different from the input length. Calculate the resulting
        # timesteps
        ctc_input_lens = ctc_input_length(model, input_lengths)
        # print('ctc_input_lens_pre:'+str(ctc_input_lens))
        prediction, ctc_cost = val_fn(
            [inputs, ctc_input_lens, labels, label_lengths, True])
        # print(labels)
        # prediction = np.swapaxes(prediction, 0, 1)
        predict_str = argmax_decode(prediction, decode_fn, ctc_input_lens)

        # print('predict_str:'+str(predict_str))
        avg_cost += ctc_cost.mean()
        print('predict_str:' + str(predict_str))
        print('texts:' + str(texts))
        acc_sum = 0
        for index, text in enumerate(texts):
            sm = edit_distance.SequenceMatcher(a=text, b=predict_str[index])
            acc = 1.0 - sm.distance() / len(text)
            acc_sum = acc_sum + acc
        avg_acc += acc_sum * 1.0 / (index + 1)
        i += 1
    if i == 0:
        return 0.0, 0.0
    return avg_cost / i, avg_acc / i
Example 21
def correctErrors(newStr, dic):
    for i in range(0, len(newStr), 4):
        str1 = newStr[i:i + 4]
        error = 1
        for j in range(8):
            if str1 == dic[str(j)]:
                error = 0
                break
        if error == 1:
            edit_dist = []
            for k in range(8):
                edit_dist.append(
                    edit_distance.SequenceMatcher(a=str1,
                                                  b=dic[str(k)]).distance())
            min_index = edit_dist.index(min(edit_dist))
            newStr = newStr[:i] + dic[str(min_index)] + newStr[i + 4:]
    return newStr
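
A possible call of correctErrors above (hypothetical 8-word codebook, assuming the function and the edit_distance import are in scope): the first 4-character chunk is not a codeword, so it is snapped to the nearest one by edit distance.

dic = {str(k): cw for k, cw in enumerate(
    ['ATTC', 'ACTA', 'ATTA', 'TATA', 'AATC', 'ACAA', 'TTTC', 'TCTA'])}
print(correctErrors('ATTGTCTA', dic))  # 'ATTCTCTA'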
Example 22
def test(model, test_fn, decode_fn, datagen, mb_size=16, conv_context=11,
         conv_border_mode='valid', conv_stride=2):
    """ Testing routine for speech-models
    Params:
        model (keras.model): Constructed keras model
        test_fn (theano.function): A theano function that calculates the cost
            over a test set
        decode_fn: Function that decodes the network output into text
        datagen (DataGenerator)
        mb_size (int): Size of each minibatch
        conv_context (int): Convolution context
        conv_border_mode (str): Convolution border mode
        conv_stride (int): Convolution stride
    Returns:
        An empty string; per-sample accuracies and their mean are printed instead
    """
    avg_cost = 0.0
    i = 0
    acc_list = []
    for batch in datagen.iterate_test(mb_size):
        inputs = batch['x']
        labels = batch['y']
        input_lengths = batch['input_lengths']
        label_lengths = batch['label_lengths']
        ground_truth = batch['texts']
        # Due to convolution, the number of timesteps of the output
        # is different from the input length. Calculate the resulting
        # timesteps
        # output_lengths = [conv_output_length(l, conv_context,
        #                                      conv_border_mode, conv_stride)
        #                   for l in input_lengths]
        ctc_input_lens = ctc_input_length(model, input_lengths)
        prediction, ctc_cost = test_fn([inputs, ctc_input_lens, labels,
                                        label_lengths, True])
        # predictions = np.swapaxes(predictions, 0, 1)
        prediction_str = argmax_decode(prediction, decode_fn, ctc_input_lens)
        for i, prediction in enumerate(prediction_str):
            truth = ground_truth[i]
            sm = edit_distance.SequenceMatcher(a=truth,b=prediction)
            acc = 1 - sm.distance()/len(truth) 
            acc_list.append(acc)
            print("Truth: {}, Prediction: {}, acc: {}".format(truth, prediction, acc))
    print(acc_list) 
    print('avg_acc:'+str(np.array(acc_list).mean()))
    return ''
Example 23
    def EDAlignment(self):
        ### this vector points to the corresponding src word for each tst word
        ### (or -1 if there is no correspondence)
        self.tst2src = [-1] * len(self.tst)
        self.src2tst = [-1] * len(self.src)
        sm = edit_distance.SequenceMatcher(self.tst, self.src)
        blocks = sm.get_matching_blocks()
        for block in blocks:
            self.tst2src[block[0]] = block[1]
            self.src2tst[block[1]] = block[0]

        if self.verbose:
            for x in range(len(self.tst2src)):
                if self.tst2src[x] != -1:
                    print('TST2SRC [{}:{} {}:{}]'.format(
                        x, self.tst[x], self.tst2src[x],
                        self.src[self.tst2src[x]]))

        return
Example 24
 def assign_points(data_points, centers, edit_distance_matrix,
                   sent2idx_dict):
     assignments = []
     for point in data_points:
         shortest = 999999  # positive infinity
         shortest_index = 0
         for i, center in enumerate(centers):
             idx_p = sent2idx_dict[' '.join(point)]
             idx_c = sent2idx_dict[' '.join(center)]
             if edit_distance_matrix[idx_p][idx_c] >= 0:
                 dist = edit_distance_matrix[idx_p][idx_c]
             else:
                 sm = edit_distance.SequenceMatcher(a=point, b=center)
                 dist = sm.distance()
                 edit_distance_matrix[idx_p][idx_c] = dist
                 edit_distance_matrix[idx_c][idx_p] = dist
             if dist < shortest:
                 shortest = dist
                 shortest_index = i
         assignments.append(shortest_index)
     return assignments
Example 25
    def test(self, dataset):
        # init stats
        E, N = 0, 0
        with tqdm(total=len(dataset),
                  bar_format='    {l_bar}{bar:30}{r_bar}') as pbar:
            for x, y, text, y_true_length in dataset.generator():
                posteriors = self.features_to_posteriors(x)
                # run viterbi to get recognized words
                best_path, pstar = self.hmm.viterbi_decode(posteriors)
                word_seq = self.hmm.getTranscription(best_path)
                # get original text
                ref_seq = text.split(' ')

                # edit distance
                res = edit_distance.SequenceMatcher(a=ref_seq, b=word_seq)
                E += res.distance()
                N += len(ref_seq)
                accuracy = (N - E) / N
                # update progress bar
                pbar.set_description(f'Test acc. {accuracy:.6f}')
                pbar.update(1)
        return accuracy
Example 26
def _get_operation_counts(
    source_string: str, destination_string: str
) -> Tuple[int, int, int, int]:
    """
    Check how many edit operations (delete, insert, replace) are required to
    transform the source string into the destination string. The number of hits
    can be given by subtracting the number of deletes and substitutions from the
    total length of the source string.

    :param source_string: the source string to transform into the destination string
    :param destination_string: the destination to transform the source string into
    :return: a tuple of #hits, #substitutions, #deletions, #insertions
    """

    #editops = Levenshtein.editops(source_string, destination_string)
    editops = edit_distance.SequenceMatcher(a=source_string, b=destination_string).get_opcodes()

    substitutions = sum(1 if op[0] == "replace" else 0 for op in editops)
    deletions = sum(1 if op[0] == "delete" else 0 for op in editops)
    insertions = sum(1 if op[0] == "insert" else 0 for op in editops)
    hits = len(source_string) - (substitutions + deletions)

    return hits, substitutions, deletions, insertions
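
A hedged follow-up showing how these counts are typically combined into an error rate (a standard WER/CER-style formula, not part of the original snippet):

hits, substitutions, deletions, insertions = _get_operation_counts('kitten', 'sitting')
error_rate = (substitutions + deletions + insertions) / len('kitten')
print(hits, error_rate)  # 4 hits, 0.5 (3 edits over 6 source characters)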
        adv_digit = attack_dir.parent.parent.parent.name[-1]
        target_filename = attack_dir.parent.parent.parent.parent.name
        target_speaker = '-'.join(target_filename.split("-")[:-1])
        original_digit = target_filename[-2]

        E, N = 0, 0
        for test_filename, r in attack_res['test_res'].items():
            pred_word_seq, label_word_seq = r['pred_word_seq'], r['label_word_seq']
            label_word_seq = ' '.join([str(d) for d in tools.str_to_digits(label_word_seq.split())])
            pred_word_seq = ' '.join([str(d) for d in tools.str_to_digits(pred_word_seq.split())])

            if test_filename.startswith(target_speaker):
                continue
            else:
                res = edit_distance.SequenceMatcher(a=label_word_seq, b=pred_word_seq)
                E += res.distance()
                N += len(label_word_seq.split(" "))

        speaker_E, speaker_N = 0, 0
        speaker_target_file_num = 0
        speaker_succeeded_targets = []
        for test_filename, r in attack_res['speaker_res'].items():
            pred_word_seq, label_word_seq = r['pred_word_seq'], r['label_word_seq']
            label_word_seq = ' '.join([str(d) for d in tools.str_to_digits(label_word_seq.split())])
            pred_word_seq = ' '.join([str(d) for d in tools.str_to_digits(pred_word_seq.split())])

            if test_filename == target_filename:
                continue

            if test_filename.startswith(target_speaker):
Example 28
output = []
net_accuracy = []
edit_distance_output = []
for j in range(5):
    net_score = 0
    total_words = 0
    edit_distance_score = 0
    accuracy = 0.0
    for i in range(int(floor(j * len(a)) / 5.0),
                   int(floor((j + 1) * len(a)) / 5.0)):
        reference = [a[i]]
        candidate = s[i]
        score = sentence_bleu(reference, candidate)
        net_score += score
        total_words += 1
        sm = edit_distance.SequenceMatcher(a=a[i], b=s[i])
        edit_distance_score += sm.ratio()
        if a[i] == s[i][:-1]:
            accuracy += 1
    net_accuracy.append(accuracy / total_words)
    edit_distance_output.append(edit_distance_score / total_words)
    output.append(net_score / total_words)

print(output)
plt.plot([1, 2, 3, 4, 5], output, 'ro')
plt.axis([0, 6, 0.7, 0.8])
plt.xlabel('Test Set')
plt.ylabel('Bleu Score')
plt.show()

print(net_accuracy)
Example 29
def editDistance(s1, s2):
    return edit_distance.SequenceMatcher(a=s1, b=s2).distance()
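
A quick usage check of the wrapper above, assuming the edit_distance package; it works on strings and on arbitrary sequences alike.

print(editDistance('kitten', 'sitting'))         # 3
print(editDistance(['a', 'b', 'c'], ['a', 'c']))  # 1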
Example 30
def codeBookGen():

	#dic = {"1":"ATTC","2":"ACTA","3":"ATTA","4":"TATA","5":"AATC","6":"ACAA","7":"TTTC","0":"TCTA",}

	#generate all possible sequences of length 4
	pool = []

	print("Making dictionary")

	for i in range(256):
		temp = format(i,'08b')	
		str2 = ""
		x = ""
		
		for j in range(4):
			x = temp[2*j:2*j+2]
			#print(x)
			if x == '00':
				str2 += 'A'
			elif x == '01':
				str2 += 'C'
			elif x == '10':
				str2 += 'G'
			else :
				str2 += 'T'


			#print(temp)	
		#print(temp)
		#print(str2)
		pool.append(str2);
		
	#pool has been created: 256 length-4 sequences
	#now test for repetitiveness

	pool1 = []
	for str2 in pool:
		myset = set()
		
		#print(str2)
		
		for i in range(16):
			ss = ""		#subsequence
			b = format(i,'04b')
			for j in range(4):
				if b[j] == '1':
					ss += str2[j]
				
			#print(ss)
			myset.add(ss)

		#print(len(myset))
		
		ratio = len(myset)/16	

		if ratio > 0.75:
			pool1.append(str2)
			#print(myset)
			#print(str2)
		#time.sleep(10)		
		
	#print(len(pool1))
		
	#The sequences with high repetitiveness have been removed.
	#Now we will remove ones with undesirable GC content (<40% or >60%)
	"""
	pool2 = []

	for str2 in pool1:
		countGC = str2.count('G') + str2.count('C')
		if countGC == 2:
			pool2.append(str2)
			print(str2)
		
	print(len(pool2))	
	"""
	pool2 = pool1				#REMOVE THIS IF GC CONTENT CONSTRAINT NEEDS TO BE INCLUDED

	no = 0

	while no != 8:
		
	#now we use edit distance constraint
		x = random.randint(0,len(pool2)-1)
		codewords = []
		codewords.append(pool2[x])

		str1 = codewords[0]


		pool3 = []
		temppool = pool2

		for i in range(len(pool2)):
			str1 = codewords[i]
			for str2 in temppool:
				dist = edit_distance.SequenceMatcher(a=str2,b=str1).distance()
				if  dist >= 3:
					pool3.append(str2)	
				"""
				else:
					print(dist)
					print(str2,str1)
				"""	
			#print('This is the length of pool 3:',len(pool3))
			if len(pool3) == 0:
				break
					
			x = random.randint(0,len(pool3)-1)
			codewords.append(pool3[x])
			temppool = pool3
			pool3 = []
				
		no = len(codewords)
	#	print(codewords)	
	return codewords