def augment(inputs, outputs, tags, hallucinator, min_len=3, max_len=10):
    """Build augmented (input, output, tag) triples from aligned pairs.

    Each input/output pair is joined into two strings and aligned with
    ``align.Aligner``. For every aligned pair for which ``find_good_range``
    reports at least one span, each reported span ``[s, e)`` is replaced, in
    both the source and the target, by ``hallucinator.sample`` applied to the
    source-side characters of that span. Pairs with no reported span
    contribute nothing to the result.

    Returns:
        ``(new_inputs, new_outputs, new_tags)``: three parallel lists holding
        the rewritten source characters, the rewritten target characters and
        ``tags[k]`` of the pair they were derived from.
    """
    joined_pairs = [(''.join(inp), ''.join(out)) for inp, out in zip(inputs, outputs)]
    # The aligner yields pairs of strings in which spaces stand for null alignments.
    aligned_pairs = align.Aligner(joined_pairs).alignedpairs

    new_inputs, new_outputs, new_tags = [], [], []
    for idx, (src, trg) in enumerate(aligned_pairs):
        spans = find_good_range(src, trg, min_len, max_len)
        if not spans:
            continue

        src_chars = list(src)
        trg_chars = list(trg)
        for start, stop in spans:
            sampled = hallucinator.sample(src_chars[start:stop])
            src_chars[start:stop] = sampled
            trg_chars[start:stop] = sampled

        def keep(pos, ch):
            # Trim whitespace, unless source and target share an aligned space here.
            return ch.strip() or (src_chars[pos] == trg_chars[pos] == ' ')

        trimmed_src = [ch for pos, ch in enumerate(src_chars) if keep(pos, ch)]
        trimmed_trg = [ch for pos, ch in enumerate(trg_chars) if keep(pos, ch)]

        new_inputs.append(trimmed_src)
        new_outputs.append(trimmed_trg)
        new_tags.append(tags[idx])

    return new_inputs, new_outputs, new_tags
def smart_align(pairs, align_symbol=ALIGN_SYMBOL, iterations=150, burnin=5,
                lag=1, mode='crp', **kwargs):
    """Align ``pairs`` with ``align.Aligner`` and return its ``alignedpairs``.

    The named settings are handed to the aligner unchanged. Any additional
    keyword arguments are accepted but are not forwarded to the aligner.
    """
    aligner = align.Aligner(
        pairs,
        align_symbol=align_symbol,
        iterations=iterations,
        burnin=burnin,
        lag=lag,
        mode=mode,
    )
    return aligner.alignedpairs
def augment(inputs, outputs, tags, characters):
    """Create noisy copies of aligned input/output pairs.

    Every input/output pair is joined into two strings and aligned with
    ``align.Aligner`` using a space as the alignment symbol. For each span
    reported by ``find_good_range``, every position is overwritten, in both
    strings and with probability one half, by one character drawn from
    ``characters`` (spaces excluded). Spans longer than five positions are
    first shrunk by one position at each end.

    Args:
        inputs: Sequence of input character sequences.
        outputs: Sequence of output character sequences, parallel to ``inputs``.
        tags: Sequence indexed like ``outputs``; ``tags[k]`` is copied to the
            result for every pair that was augmented.
        characters: Iterable of candidate replacement characters.

    Returns:
        ``(new_inputs, new_outputs, new_tags)``: three lists with one entry
        per aligned pair. Pairs for which ``find_good_range`` reports nothing
        get an empty-list placeholder in all three lists, so positions stay
        in step with the aligned pairs.
    """
    temp = [(''.join(inputs[i]), ''.join(outputs[i])) for i in range(len(outputs))]
    aligned = align.Aligner(temp, align_symbol=' ').alignedpairs

    # The alignment symbol must never be drawn as a replacement character.
    # Filtering drops every occurrence and needs no exception handling
    # (previously: list.remove wrapped in a bare ``except: pass``).
    vocab = [c for c in characters if c != u" "]

    new_inputs = []
    new_outputs = []
    new_tags = []
    for k, item in enumerate(aligned):
        i, o = item[0], item[1]
        good_range = find_good_range(i, o)
        if good_range:
            new_i, new_o = list(i), list(o)
            for r in good_range:
                s = r[0]
                e = r[1]
                if e - s > 5:  # arbitrary value
                    s += 1
                    e -= 1
                for j in range(s, e):
                    if random() > 0.5:  # arbitrary value
                        # The same character goes into both sides so the
                        # pair stays aligned at this position.
                        nc = choice(vocab)
                        new_i[j] = nc
                        new_o[j] = nc
            # Drop whitespace characters, except where both sides hold a
            # space at the same position.
            new_i1 = [
                c for l, c in enumerate(new_i)
                if (c.strip() or (new_o[l] == ' ' and new_i[l] == ' '))
            ]
            new_o1 = [
                c for l, c in enumerate(new_o)
                if (c.strip() or (new_i[l] == ' ' and new_o[l] == ' '))
            ]
            new_inputs.append(new_i1)
            new_outputs.append(new_o1)
            new_tags.append(tags[k])
        else:
            new_inputs.append([])
            new_outputs.append([])
            new_tags.append([])

    return new_inputs, new_outputs, new_tags
def process(organism_ids, align_method, similarity_mode,
            power_alpha=cs.ALPHA_BIAS, check=True, visual=False):
    """Build the network for ``organism_ids``, align it and return the alignment.

    The network object comes from ``initialize_network``; the alignment is
    produced by ``align.Aligner(align_method).align`` with ``check`` passed
    through. When ``visual`` is true, the two organisms of the network and
    the alignment are additionally handed to the ``visualize.gephi_*``
    helpers before returning.
    """
    network = initialize_network(
        organism_ids, align_method, similarity_mode, power_alpha
    )
    alignment = align.Aligner(align_method).align(network, check=check)

    if not visual:
        return alignment

    # Visualization: each organism on its own, then the aligned network.
    visualize.gephi_organism_ppi(network.org1)
    visualize.gephi_organism_ppi(network.org2)
    visualize.gephi_network_aligned(alignment, network)
    visualize.gephi_network_aligned_comp(alignment, network)
    return alignment
def med_align(wordpairs, align_symbol):
    """Return the ``alignedpairs`` of an ``align.Aligner`` run in ``'med'`` mode."""
    return align.Aligner(
        wordpairs,
        align_symbol=align_symbol,
        mode='med',
    ).alignedpairs
def mcmc_align(wordpairs, align_symbol):
    """Return the ``alignedpairs`` of an ``align.Aligner`` built with its default mode."""
    return align.Aligner(wordpairs, align_symbol=align_symbol).alignedpairs
# Excerpt from a longer routine. It relies on names bound outside this excerpt:
# citationforms, negcitationforms, lines2, traindata1, task, constrained,
# words, fromlemma and tolemma.
# NOTE(review): the original line breaks and indentation are missing, so the
# exact nesting of the statements below cannot be confirmed from this line.
# In reading order the statements:
#   * rebind citationforms to the set of its keys c with citationforms[c] > 4
#     and citationforms[c] / (citationforms[c] + negcitationforms[c]) >= 0.95;
#   * split each entry of lines2 on tabs into msd1, form1, msd2, form2 and,
#     when msd1 != msd2, append (form1, form2, msd2) if msd1 is in
#     citationforms and (form2, form1, msd1) if msd2 is in citationforms;
#   * when task == 1 or not constrained, align the first two fields of every
#     traindata1 tuple with align.Aligner (align_symbol u'_', iterations=30)
#     and extend each tuple with its two aligned strings;
#   * call consvowOCP.candv(words) and unpack the result into C and V;
#   * ("Lemma > form") when task == 1 or not constrained, walk traindata1,
#     create empty lists in fromlemma and tolemma for msd values not yet
#     present, and pair the two aligned strings as alignedpair1.
citationforms = { c for c in citationforms if citationforms[c] > 4 and citationforms[c] / float(citationforms[c] + negcitationforms[c]) >= 0.95 } for l in lines2: msd1, form1, msd2, form2 = l.split(u'\t') if msd1 in citationforms and msd1 != msd2: traindata1.append((form1, form2, msd2)) if msd2 in citationforms and msd1 != msd2: traindata1.append((form2, form1, msd1)) if task == 1 or not constrained: wordpairs = [(x[0], x[1]) for x in traindata1] a = align.Aligner(wordpairs, align_symbol=u'_', iterations=30) traindata1 = [(traindata1[i][0], traindata1[i][1], traindata1[i][2], a.alignedpairs[i][0], a.alignedpairs[i][1]) for i in range(len(traindata1))] C, V = consvowOCP.candv(words) # Lemma > form if task == 1 or not constrained: for lemma, form, msd, lemmaaligned, formaligned in traindata1: if msd not in fromlemma: fromlemma[msd] = [] if msd not in tolemma: tolemma[msd] = [] alignedpair1 = (lemmaaligned, formaligned)
def mcmc_align(wordpairs, align_symbol):
    """Return the ``alignedpairs`` of an ``align.Aligner`` built with ``random_seed=42``."""
    return align.Aligner(
        wordpairs,
        align_symbol=align_symbol,
        random_seed=42,
    ).alignedpairs