Esempio n. 1
0
    def fix_sequence(self):
        """
        Resample codons until the coding sequence passes every check.

        Each pass over ``self.nt_seq`` flags the codons covered by
        (a) the first occurrence of any excluded site (type IIs site,
        assembly restriction sites, primers, 5-nt homopolymers, and the
        reverse complement of each), and (b) any common substring longer
        than 10 nt shared between a primer and either strand of the sequence.
        One flagged codon is picked with ``self.rng`` and resampled, and the
        scan starts over. Once nothing is flagged, the GC content must lie
        within 35-65 (inclusive); if it does not, a random codon is resampled
        and the scan starts over.

        Returns
        -------
        None.
        """
        forbidden = [
            self.typeIIs, self.asmf_re, self.asmr_re, self.gsp_f, self.gsp_r,
            self.asm_f, self.asm_r, 'AAAAA', 'GGGGG', 'CCCCC', 'TTTTT'
        ]
        # also forbid every site on the opposite strand
        forbidden += [utils.rev_comp(motif) for motif in forbidden]

        primer_seqs = [
            self.gsp_f,
            utils.rev_comp(self.gsp_r), self.asm_f,
            utils.rev_comp(self.asm_r)
        ]

        gc_low, gc_high = 35, 65

        while True:
            flagged = set()

            # exact matches to excluded sites (first occurrence only; later
            # occurrences are caught on subsequent passes)
            for motif in forbidden:
                hit = self.nt_seq.find(motif)
                if hit > -1:
                    covered = np.arange(hit, hit + len(motif))
                    flagged.update(utils.get_codons(covered))

            # long partial matches to primers
            for primer in primer_seqs:
                fwd_len, fwd_pos, _ = utils.lcs(self.nt_seq, primer)
                if fwd_len > 10:
                    flagged.update(utils.get_codons(fwd_pos))

                # primers are single stranded, but the templates are not (after one cycle, at least)
                rev_len, rev_pos, _ = utils.lcs(
                    utils.rev_comp(self.nt_seq), primer)
                if rev_len > 10:
                    flagged.update(utils.get_codons(rev_pos))

            if flagged:
                # something is still wrong: resample one offending codon and rescan
                self.sample_new_codon(self.rng.choice(list(flagged)))
                continue

            if gc_low <= GC(self.nt_seq) <= gc_high:
                break

            # no restriction sites but a bad GC content... pick a random site to change
            self.sample_new_codon(
                self.rng.choice(np.arange(len(self.codons))))

        return
def align_podcast_tokens(args, df):
    """Align the embedding tokens with the datum words (onset/offset table).

    Reads ``data/<args.project_id>/podcast-datum-cloze.csv`` relative to the
    current working directory, lower-cases its ``word`` column, and aligns it
    against ``df['token2word']`` using ``lcs``. Only the aligned rows of each
    table are kept; the two tables are then joined column-wise and duplicated
    column names are dropped (first occurrence wins).

    Args:
        args (Namespace): namespace object containing project parameters
            (only ``project_id`` is read here)
        df (DataFrame): embeddings dataframe with a ``token2word`` column

    Returns:
        DataFrame: aligned/filtered dataframe (goes into encoding)
    """
    data_dir = os.path.join(os.getcwd(), 'data', args.project_id)
    datum_path = os.path.join(data_dir, 'podcast-datum-cloze.csv')
    datum = pd.read_csv(datum_path, sep=',')

    datum_words = [str.lower(word) for word in datum.word.tolist()]
    embedding_tokens = df['token2word'].tolist()

    # lcs yields one row selector per input list
    datum_rows, embedding_rows = lcs(datum_words, embedding_tokens)

    datum_aligned = datum.iloc[datum_rows, :].reset_index(drop=True)
    embedding_aligned = df.iloc[embedding_rows, :].reset_index(drop=True)

    merged = pd.concat([embedding_aligned, datum_aligned], axis=1)
    first_occurrence = ~merged.columns.duplicated()
    return merged.loc[:, first_occurrence]
def map_attributes_to_mesh(value, banned_columns=None):
    """Translate a free-text term into a MeSH code via the catalogue service.

    Posts ``value`` to ``CATALOGUE_TRANSLATE_URL`` and picks, among the
    returned candidates that are not banned, the one with the highest score.
    Candidates whose code starts with "D" get a fixed score of 1000; all
    others are scored with ``lcs(value, mesh_label)``.

    Args:
        value (str): term to translate.
        banned_columns: optional container of MeSH codes to ignore. ``None``
            (the default) means nothing is banned.

    Returns:
        The selected MeSH code, ``value`` itself when no candidate qualifies,
        or ``None`` when the service does not answer with HTTP 200.

    Raises:
        AssertionError: if ``value`` is not a string.
    """
    assert isinstance(
        value, str
    ), "Value passed to 'map_attributes_to_mesh' must be of type <str>"

    data = {'term': value, 'terminology': 'text', 'result_format': 'json'}

    response = requests.post(CATALOGUE_TRANSLATE_URL, data=data)

    if response.status_code != 200:
        return None

    maximum_ed = None
    mesh_code = None

    for mesh_candidate in response.json():
        candidate_code = mesh_candidate['mesh_code']

        # Test for None first: `x in None` raises TypeError, which made the
        # default banned_columns=None unusable.
        if banned_columns is not None and candidate_code in banned_columns:
            continue

        if candidate_code.startswith("D"):
            current_ed = 1000
        else:
            current_ed = lcs(value, mesh_candidate['mesh_label'])

        # strict '>' keeps the earliest candidate on ties
        if maximum_ed is None or current_ed > maximum_ed:
            maximum_ed = current_ed
            mesh_code = candidate_code

    if mesh_code is not None:
        return mesh_code
    return value
Esempio n. 4
0
def get_candidate(k2, candidates, ref_scores=None):
    """Rank ``candidates`` by similarity to the key ``k2``.

    Both the key and the candidates are stripped of accents with
    ``utils.accent2bare``. The score of a candidate is::

        (len(lcs(bare_key, bare_candidate))
         + 0.1 if key and candidate share their first character
         + log(100 + ref_score) / log(1000)) / len(bare_key)

    ``k2`` and every candidate are expected to be non-empty: their first
    characters are compared and the score is divided by the length of the
    bare key.

    Args:
        k2: key string (with accents).
        candidates: sequence of candidate strings.
        ref_scores: optional sequence of reference scores, indexed like
            ``candidates``. ``None`` (the default) is treated as a reference
            score of 0 for every candidate.

    Returns:
        tuple: ``(ordered_candidates, sorted_scores)`` where the ordering is
        whatever ``utils.sort_array_indices`` produces for the scores.
    """
    k = utils.accent2bare(k2)
    l_bares = [utils.accent2bare(c) for c in candidates]

    dc = len(k) * 1.0
    l_sims = []
    for i, bare in enumerate(l_bares):
        count = len(utils.lcs(k, bare)) * 1.0
        if k2[0] == candidates[i][0]:
            count += 0.1

        # The default ref_scores=None used to crash on indexing; fall back to
        # a neutral reference score instead.
        ref_score = 0.0 if ref_scores is None else ref_scores[i]
        count += math.log(100.0 + ref_score) / math.log(1000)

        l_sims.append(count / dc)

    sorted_scores, sorted_indices = utils.sort_array_indices(l_sims)
    l_true_candidates = [candidates[ind] for ind in sorted_indices]
    return l_true_candidates, sorted_scores
Esempio n. 5
0
def tag_brand_name(pt_text, attr_vals):
    """Return the part of ``pt_text`` that matches the product's brand.

    When ``attr_vals`` has a ``'brand'`` entry, the result is
    ``utils.lcs(pt_text, brand)`` converted to ``str``; otherwise ``None``.
    """
    if 'brand' not in attr_vals:
        return None

    return str(utils.lcs(pt_text, attr_vals['brand']))
Esempio n. 6
0
def cal_same_terms(ori_quests, cand_quests, features):
    """
    Append, for each question pair, the fraction of the original question's
    terms that also appear in the common sequence returned by ``lcs``.

    ``features[idx]`` receives ``len(common) / len(original)`` as a float.
    An empty original question scores 0.0 (it previously raised
    ZeroDivisionError). Nothing is returned; ``features`` is updated in place.
    """
    for idx, ori in enumerate(ori_quests):
        ori_quest = list(ori)
        if not ori_quest:
            # nothing to share; avoid dividing by zero
            features[idx].append(0.0)
            continue

        cand_quest = list(cand_quests[idx])
        _, commList = lcs(ori_quest, cand_quest)
        features[idx].append(float(len(commList)) / len(ori_quest))
Esempio n. 7
0
def cal_max_similarity_term(ori_quests, cand_quests, features):
    """
    Append, for each question pair, the share of the original question's
    total term length that is covered by the common terms returned by ``lcs``.

    ``features[idx]`` receives ``sum(len(t) for t in common) /
    sum(len(t) for t in original)`` as a float. When the original question
    has zero total length (no terms, or only empty terms) the score is 0.0
    (it previously raised ZeroDivisionError). Nothing is returned;
    ``features`` is updated in place.
    """
    for idx, ori in enumerate(ori_quests):
        ori_quest = list(ori)
        oriLen = sum(len(each) for each in ori_quest)
        if oriLen == 0:
            # nothing to cover; avoid dividing by zero
            features[idx].append(0.0)
            continue

        cand_quest = list(cand_quests[idx])
        _, commList = lcs(ori_quest, cand_quest)
        commLength = sum(len(each) for each in commList)
        features[idx].append(float(commLength) / oriLen)
Esempio n. 8
0
def tag_processor_type(pt_text, attr_vals):
    """Extract the processor mention from ``pt_text``.

    The text shared between ``pt_text`` and ``attr_vals['processor_type']``
    (``utils.lcs``) is used as the anchor of a case-insensitive pattern: the
    anchor, optional whitespace, then optionally a model designation made of
    an optional "<letter><digits>" prefix followed by '-' or space, four
    digits, and optional trailing letters.

    Args:
        pt_text (str): product text to search.
        attr_vals: mapping of attribute names to values.

    Returns:
        The first match in ``pt_text``, or ``None`` when there is no
        non-empty ``'processor_type'`` attribute or nothing matches.
    """
    if 'processor_type' not in attr_vals:
        return None

    proc = attr_vals['processor_type']
    if len(proc) == 0:
        return None

    # The anchor is literal product text, not a regex: escape it so that
    # characters such as '(', '+' or '.' cannot break or alter the pattern.
    anchor = re.escape(utils.lcs(pt_text, proc))
    pattern = re.compile(
        '(' + anchor + r')\s*(([A-Za-z][0-9]+[- ])?\s*[0-9]{4}([A-Za-z]+)?)?',
        re.IGNORECASE)

    match = pattern.search(pt_text)
    if match is None:
        return None
    return match.group(0)
Esempio n. 9
0
def predict_lcs(lcs_classificator, nn_tree, testSample, use_min=False):
    """Predict the class whose samples share the longest sequences with the input.

    Every element of ``testSample`` is first replaced by its nearest
    neighbour as reported by ``nn_tree.query(..., k=1)``. For each class in
    ``lcs_classificator.classes`` and each sample of that class, both
    sequences are restricted to their shared elements and scored with
    ``lcs``; samples with no shared element score 0.0. The class score is the
    mean of its sample scores.

    Works on both Python 2 and Python 3: the original used the ``print``
    statement and relied on ``filter`` returning a list.

    Args:
        lcs_classificator: object exposing ``classes``, an iterable of
            classes, each a sized collection of sample sequences.
        nn_tree: object exposing ``query(X, k=1, return_distance=False)``.
        testSample: input passed to ``nn_tree.query``.
        use_min: unused; kept for interface compatibility.

    Returns:
        Index (``np.argmax``) of the best-scoring class.
    """
    testSample = nn_tree.query(testSample, k=1, return_distance=False)
    testSample = [sample[0] for sample in testSample]
    # build once instead of once per sample
    test_elements = set(testSample)

    scores = []
    for cl in lcs_classificator.classes:
        class_score = []
        for sample in cl:
            intersection = test_elements.intersection(sample)

            if len(intersection) > 0:
                # materialise lists: on Python 3 filter() would hand lcs a
                # one-shot iterator
                test = [x for x in testSample if x in intersection]
                shared = [x for x in sample if x in intersection]
                class_score.append(lcs(test, shared))
            else:
                class_score.append(0.0)

        # progress output: scores of the classes processed so far
        print(scores)
        scores.append(sum(np.array(class_score)) / float(len(cl)))

    return np.argmax(scores)
Esempio n. 10
0
    def split_kmers(self,
                    min_overlap=16,
                    max_overlap=35,
                    min_tm=59,
                    max_tm=64,
                    min_gc=40,
                    max_gc=60,
                    min_ddg=-3,
                    min_dimer=-9):
        """
        Split the coding sequence into overlapping oligos.

        The construct asm_f + asmf_re + nt_seq + stop + asmr_re +
        rev_comp(asm_r) is cut into fragments whose 3' overlaps satisfy the
        length, melting temperature, GC and (optionally) secondary-structure
        constraints below. Each fragment is then wrapped as
        gsp_f + typeIIs + fragment + rev_comp(typeIIs) + random padding +
        rev_comp(gsp_r).

        Parameters
        ----------
        (int) min_overlap -- minimum bp in overlap sequences. Default 16
        (int) max_overlap -- maximum bp in overlap sequences. Default 35
        (float) min_tm -- minimum melting temp in overlap sequences. Default 59
        (float) max_tm -- maximum melting temp in overlap sequences. Default 64
        (float) min_gc -- minimum %GC content in overlap sequences. Default 40
        (float) max_gc -- maximum %GC content in overlap sequences. Default 60
        (float) min_ddg=-3 -- minimum hairpin/secondary structure deltaG from ViennaRNA RNAfold (DNA parameters)
                              If None, do not check this
        (float) min_dimer=-9 -- minimum self-association deltaG from ViennaRNA RNAduplex (DNA parameters)
                              If None, do not check this

        Returns
        -------
        None. Oligos stored in oligos field of Gene object. The overlaps and
        their GC content / melting temperature are stored in the overlaps,
        overlap_gc and overlap_tm fields.

        Raises
        ------
        Exception
            If the search over per-oligo lengths leaves the range
            [max_overlap + 2, available_nt] without producing a valid split.
        AssertionError
            If a sanity check on the assembled gene or on a completed oligo
            fails.
        """

        # calculate expected number of oligos necessary so that length can be equal-ish
        tot_assembled = len(self.aa_seq)*3+len(self.asm_f)+len(self.asmf_re) + \
                        len(self.asmr_re)+len(self.asm_r) # total sequence that needs to be split into fragments
        available_nt = self.oligo_size - len(self.gsp_f) - len(
            self.gsp_r) - 2 * len(
                self.typeIIs)  # non-constant oligo region size
        expected_overlap_nt = (
            np.ceil(tot_assembled / available_nt) - 1
        ) * max_overlap  # expected number of bp doubly represented due to overlaps
        expected_oligos = np.ceil(
            (tot_assembled + expected_overlap_nt) /
            available_nt)  # expected number of oligos including overlap region
        target_length = int(
            (tot_assembled + expected_overlap_nt) // expected_oligos
        )  # length to target per fragment to get roughly equal lengths

        # The basic gist of how this works is you start with the full sequence, which you have already determined the nucleotides
        # for (this is the main difference from Bill's code). You take as much of that sequence as you can to fill the current
        # oligo, cut it back until you get a GC on the 3' end, then work backwards until you get a good overlap. If you can't get
        # a good overlap (usually due to GC content), start over with a different max length. Max lengths go from n, n-1, n+1, n-2, n+2, ...
        # If the max length becomes greater than the allowed length (or shorter than max_overlap+2, but usually the former happens first),
        # can't assemble the sequence. Try with a different random seed to produce a different sequence.
        # Note: the former constraint can be relaxed to try some shorter lengths too, but seems to work OK for now

        # the complete construct to split; it does not change during the
        # search, so build it once and reuse it for every restart and check
        full_seq = self.asm_f + self.asmf_re + self.nt_seq + self.stop + self.asmr_re + utils.rev_comp(
            self.asm_r)

        all_oligos = False  # keep track of progress
        curr_max = target_length  # max number of bp allowed in single oligo
        curr_oligo = full_seq
        next_oligo = ""
        while not all_oligos:

            if len(curr_oligo) > curr_max:
                # just take what will fit on the current oligo, save the
                # rest for later
                next_oligo = curr_oligo[curr_max:]
                curr_oligo = curr_oligo[:curr_max]

            if len(next_oligo) == 0:  # don't need to find an overlap bc done

                self.oligos.append(curr_oligo)

                # check the assembly: stitch consecutive oligos on the last
                # 10 nt of the previous one
                gene = ""
                badoverlap = False
                for i in range(len(self.oligos)):
                    if i == 0:
                        gene += self.oligos[i]
                    else:
                        common = self.oligos[i].find(self.oligos[i - 1][-10:])
                        if common < 0:
                            badoverlap = True
                            break
                        gene += self.oligos[i][common + 10:]

                # check for additional overlap sites between different oligos
                for i, olap in enumerate(self.overlaps):
                    for j, oligo in enumerate(self.oligos):
                        # overlap i corresponds to oligo i and oligo i+1
                        if j == i:  # overlap at the end
                            true_occur = oligo.find(olap)
                            trimmed_seq = oligo[:true_occur]
                        elif j == i + 1:  #overlap at the beginning
                            true_occur = oligo.find(olap)
                            trimmed_seq = oligo[true_occur + len(olap):]
                        else:
                            trimmed_seq = oligo

                        match_len_fwd, _, _ = utils.lcs(trimmed_seq, olap)
                        match_len_rev, _, _ = utils.lcs(
                            trimmed_seq, utils.rev_comp(olap))
                        if match_len_fwd > 10 or match_len_rev > 10:
                            print("Bad overlap due to possible mispriming.")
                            print("Oligo %d overlap %d match %d bp" %
                                  (j, i, max(match_len_fwd, match_len_rev)))
                            badoverlap = True
                            break

                if badoverlap or gene != full_seq:

                    # try the next max length: n-1, n+1, n-2, n+2, ...
                    if curr_max < target_length:
                        curr_max = 2 * target_length - curr_max
                    else:
                        curr_max = 2 * target_length - curr_max - 1

                    if curr_max > available_nt or curr_max < max_overlap + 2:
                        raise Exception(
                            "Couldn't find oligos with given framework, failed at assembly"
                        )

                    curr_oligo = full_seq
                    next_oligo = ""
                    self.oligos = []
                    self.overlaps = []
                    self.overlap_gc = []
                    self.overlap_tm = []

                    # Restart from the top so the full sequence is split at
                    # the new max length. Without this, the unsplit sequence
                    # fell through to the overlap search below and was
                    # recorded as a single oversized oligo.
                    continue
                else:
                    assert gene.count(self.asm_f) == 1, "Incorrect number AsmF"
                    assert gene.count(
                        self.asmf_re) == 1, "Incorrect number AsmF RE"

                    assert gene.count(utils.rev_comp(
                        self.asm_r)) == 1, "Incorrect number AsmR"
                    assert gene.count(
                        self.asmr_re) == 1, "Incorrect number AsmR RE"

                    all_oligos = True
                    continue

            # trim back to g or c
            while curr_oligo[-1] not in 'GC':
                next_oligo = curr_oligo[-1] + next_oligo
                curr_oligo = curr_oligo[:-1]

            # find the overlap
            overlap_pos = len(curr_oligo) - min_overlap + 1
            # sentinels that fail (tm, gc) or pass (ddg, dimer) the checks
            # until real values are computed in the loop
            curr_tm = 0
            curr_gc = 0
            curr_ss_ddg = 10
            curr_dimer = 10
            while (curr_oligo[overlap_pos] not in 'GC'
                   or not min_tm <= curr_tm <= max_tm
                   or not min_gc <= curr_gc <= max_gc
                   or (min_ddg is not None and not curr_ss_ddg > min_ddg)
                   or (min_dimer is not None and not curr_dimer > min_dimer)):

                overlap_pos -= 1  # initial case accounted for in math above

                if overlap_pos < len(
                        curr_oligo
                ) - max_overlap or curr_tm > max_tm:  #Tm is never going to decrease
                    break
                    # no good overlap... try different max length and
                    # restart the loop

                # don't bother with expensive calcs if the loop is just going to fail anyway
                if curr_oligo[overlap_pos] not in 'GC':
                    continue

                temp_overlap = Seq(curr_oligo[overlap_pos:])
                # tm calculation with salt correction for KOD reaction
                curr_tm = mt.Tm_NN(temp_overlap, Mg=1.5, dNTPs=0.8)
                curr_gc = GC(temp_overlap)

                # ViennaRNA external software
                if min_ddg is not None:
                    curr_ss_ddg = utils.pred_ss_ddg(str(
                        temp_overlap))  # this calculation slows it down a LOT
                if min_dimer is not None:
                    curr_dimer = utils.pred_dimer(str(temp_overlap),
                                                  str(temp_overlap))

            if (curr_oligo[overlap_pos] not in 'GC'
                    or not min_tm <= curr_tm <= max_tm
                    or not min_gc <= curr_gc <= max_gc
                    or (min_ddg is not None and not curr_ss_ddg > min_ddg)
                    or (min_dimer is not None and not curr_dimer > min_dimer)):
                # this means the above loop broke, so try diff max length
                # and restart the loop
                if curr_max < target_length:
                    curr_max = 2 * target_length - curr_max
                else:
                    curr_max = 2 * target_length - curr_max - 1

                if curr_max > available_nt or curr_max < max_overlap + 2:
                    raise Exception(
                        "Couldn't find oligos with given framework, failed at melting temp"
                    )

                curr_oligo = full_seq
                # drop the leftover from the abandoned attempt so it cannot
                # leak into the restart when the sequence fits in curr_max
                next_oligo = ""
                self.oligos = []
                self.overlaps = []
                self.overlap_gc = []
                self.overlap_tm = []

                continue

            # otherwise process the overlap!
            self.oligos.append(curr_oligo)
            self.overlaps.append(curr_oligo[overlap_pos:])
            self.overlap_gc.append(curr_gc)
            self.overlap_tm.append(curr_tm)

            curr_oligo = self.overlaps[-1] + next_oligo  # add in the overlap
            next_oligo = ""

        # add the gsps, type IIs, any buffer residues to everything

        # sites and primers that must not appear in the padding; identical
        # for every oligo, so build them once
        to_exclude = [
            self.typeIIs, self.asmf_re, self.asmr_re, self.gsp_f,
            self.gsp_r, self.asm_f, self.asm_r, 'AAAAA', 'GGGGG', 'CCCCC',
            'TTTTT'
        ]
        to_exclude.extend([utils.rev_comp(subseq) for subseq in to_exclude])

        primers = [
            self.gsp_f,
            utils.rev_comp(self.gsp_r), self.asm_f,
            utils.rev_comp(self.asm_r)
        ]

        full_oligos = []
        for i, oligo in enumerate(self.oligos):

            # add buffer between 3' GSP site, TypeIIs site to bring oligo
            # up to full size
            padding_size = available_nt - len(oligo)
            padding = self.rng.choice(['A', 'C', 'G', 'T'], padding_size)
            good_padding = False

            left_boundary = oligo[-4:] + utils.rev_comp(self.typeIIs)
            # only the padding positions may be changed, not the boundaries
            fixable_pos = set(
                range(len(left_boundary),
                      len(left_boundary) + padding_size))

            # make sure you're not getting any restriction/priming sites in the buffer bp that will
            # mess things up
            while not good_padding:
                subseq = left_boundary + "".join(padding) + utils.rev_comp(
                    self.gsp_r)[:5]

                bad_pos = set()
                for site in to_exclude:
                    # include boundaries
                    substr = subseq.find(site)
                    if substr >= 0:
                        bad_pos.update(
                            fixable_pos.intersection(
                                range(substr, substr + len(site))))

                for primer in primers:
                    match_len, bad_nt_pos, _ = utils.lcs(subseq, primer)
                    if match_len > 10:
                        bad_pos.update(fixable_pos.intersection(bad_nt_pos))

                    # primers are single stranded, but the templates are not (after one cycle, at least)
                    match_len, bad_nt_pos, _ = utils.lcs(
                        utils.rev_comp(subseq), primer)
                    if match_len > 10:
                        bad_pos.update(fixable_pos.intersection(bad_nt_pos))

                if len(bad_pos) == 0:
                    good_padding = True
                else:
                    # resample one offending padding base and check again
                    to_fix = self.rng.choice(list(bad_pos))
                    padding[to_fix - len(left_boundary)] = self.rng.choice(
                        ['A', 'C', 'G', 'T'])

            padding = "".join(padding)

            complete_oligo = self.gsp_f + self.typeIIs + oligo + utils.rev_comp(self.typeIIs) + \
                             padding + utils.rev_comp(self.gsp_r)

            # already checked the full assembly, so now make sure nothing was accidentally introduced
            # at boundaries
            assert complete_oligo.count(
                self.gsp_f) == 1, "GSP F not found in %d -th oligo" % i
            assert complete_oligo.count(utils.rev_comp(
                self.gsp_r)) == 1, "GSP_R not found in %d -th oligo" % i
            assert complete_oligo.count(utils.rev_comp(
                self.gsp_f)) == 0, "GSP_F RC found in %d -th oligo" % i
            assert complete_oligo.count(
                self.gsp_r) == 0, "GSP_R RC found in %d -th oligo" % i

            if self.typeIIs == utils.rev_comp(self.typeIIs):
                # palindromic site: both flanking copies read the same
                assert complete_oligo.count(
                    self.typeIIs
                ) == 2, "Extra Type IIS sites in %d -th oligo" % i
            else:
                assert complete_oligo.count(
                    self.typeIIs
                ) == 1, "Extra Type IIS sites in %d -th oligo" % i
                assert complete_oligo.count(
                    utils.rev_comp(self.typeIIs)
                ) == 1, "Extra Type IIS sites in %d -th oligo" % i

            assert complete_oligo.count(utils.rev_comp(
                self.asm_f)) == 0, "AsmF RC in in %d -th oligo" % i
            assert complete_oligo.count(
                self.asm_r) == 0, "AsmR in %d -th oligo" % i
            if self.asmf_re != utils.rev_comp(self.asmf_re):
                assert complete_oligo.count(utils.rev_comp(
                    self.asmf_re)) == 0, "AsmF RE RC in %d -th oligo" % i
            if self.asmr_re != utils.rev_comp(self.asmr_re):
                assert complete_oligo.count(utils.rev_comp(
                    self.asmr_re)) == 0, "AsmR RE RC in %d -th oligo" % i

            full_oligos.append(complete_oligo)

        self.oligos = full_oligos

        return