Code example #1
def is_partial_match(query, table_names):
    query = lemma(query)
    table_names = [[lemma(x) for x in names.split(' ')]
                   for names in table_names]
    same_count = 0
    result = None
    for names in table_names:
        if query in names:
            same_count += 1
            result = names
    return result if same_count == 1 else False
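A minimal usage sketch for the helper above. It assumes the bare lemma() it calls is pattern.en's lemmatizer, and the query and table names below are made up for illustration:

from pattern.en import lemma  # assumption: the lemma() used by is_partial_match

table_names = ['singer', 'concert', 'stadium']
print(is_partial_match('singers', table_names))
# Returns the lemmatized token list of the single matching table name,
# or False when zero or several names contain the lemmatized query.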
Code example #2
File: feature_extractor.py  Project: tuxedocat/Nyanco
 def _read_errorprob(self):
     try:
         prob_v = FeatureExtractor.dic_errorprobs[self.v]
     except KeyError:
         prob_v = FeatureExtractor.dic_errorprobs[en.lemma(self.v)]
     finally:
         pass
Code example #3
def get_rhyme_word(old_word, candidates, pos):
    #Find the candidates in the lexicon
    lemmas = [lemma(candidate['word']) for candidate in candidates]
    filtered_lemmas = filter_candidates(lemmas, pos)
    options = [candidate for candidate in candidates if lemma(candidate['word']) in filtered_lemmas]

    if not options:
        return ''

    best_options = [option['word'] for option in options if option['score'] == options[0]['score']]

    closest, score = most_similar(old_word, best_options, pos)
    if score > 2.5:
        return ''

    return closest
Code example #4
def agg_n_grams_by_line(poems, template):
    logging.info('Starting aggregator: agg_n_grams_by_line')
    #First extend all poems to the length of the longest poem
    max_len = max([len(poem.poem) for poem in poems])
    extended_poems = [(poem.poem + ['']*(max_len-len(poem.poem))) for poem in poems]
    #Then zip all together
    poem_lines = zip(*[poem for poem in extended_poems])

    #Then look a line at a time (so each first line of each poem, second line of each poem etc.)
    n_grams_by_line = []
    for line in poem_lines:
        n_grams = []
        for poem_line in line:
            #Now get the n_grams for this line for all n up to the length of the line and add it if not just stop words
            split_line = get_tokenized_words(poem_line)
            split_line = [lemma(word) for word in split_line]
            for n in range(1, len(split_line)):
                grams = ngrams(split_line, n)
                n_grams.extend([gram for gram in grams if len(set(gram) - stop_words)])
        n_grams_by_line.append(n_grams)

    #Now filter by the ones that actually occur with some significant frequency
    min_num_occurrences = round(len(poems) * 0.1)
    for n_grams_line in n_grams_by_line:
        counts = Counter(n_grams_line)
        template.n_grams_by_line.append([(' '.join(g for g in gram), count) for gram, count in counts.items() if count > min_num_occurrences])

    reduced_n_grams_by_line = []
    for entry in template.n_grams_by_line:
        reduced_n_grams_by_line.append(remove_redundant_substring_occurences(entry, min_num_occurrences))

    template.n_grams_by_line = reduced_n_grams_by_line

    logging.info('Aggregator finished: agg_n_grams_by_line')
Code example #5
File: cleand.py  Project: youzipi/graduation-project
 def _collect_words_d(word_list):
     cleaned_list = []
     for w in word_list:  # type: str
         # strip leading/trailing whitespace
         w = w.strip()
         w = lemma(w)
         cleaned_list.append(w)
     return cleaned_list
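A quick check of the helper above, again assuming pattern.en's lemma() and a made-up word list:

from pattern.en import lemma  # assumption: the lemma() used by _collect_words_d

print(_collect_words_d(['  walked ', 'running']))
# Each entry is stripped and verb-lemmatized, e.g. 'running' -> 'run'.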
Code example #6
File: feature_extractor.py  Project: tuxedocat/Nyanco
 def _find_verb_idx(self):
     verbpos = [idx for idx, sufpos in enumerate(zip(self.SUF, self.POS)) if sufpos[0] == self.v and "VB" in sufpos[1]]
     if verbpos:
         return verbpos[0]
     else:
         SUF_l = [en.lemma(w) for w in self.SUF]
         verbpos = [idx for idx, sufpos in enumerate(zip(SUF_l, self.POS)) if sufpos[0] == self.v and "VB" in sufpos[1]]
         if verbpos:
             return verbpos[0]
         else:
             return None
Code example #7
 def access(self, uri, lemma):
     f = open(uri, 'r')
     for line in f:
         line = line.strip()
         if line != '':
             if lemma is not False:
                 line = en.lemma(line)
             if self.vocab_dict.get(line):
                 self.vocab_dict[line] += 1
             else:
                 self.vocab_dict[line] = 1
     self.size = len(self.vocab_dict)
     f.close()
Code example #8
def word_element(file_urls=()):
    for url in file_urls:
        result = list()
        file = open(const.clean_path + url, 'r', encoding='utf-8')
        fp = open(const.element_path + url, 'w', encoding='utf-8')
        for line in file:
            words = re.split(r'\s', line)
            for word in words:
                if word.strip() != '':
                    s = en.lemma(word)
                    if s not in const.simple_word:
                        result.append(word + " " + s)
        file.close()
        fp.writelines([line + '\n' for line in result])
        fp.close()
Code example #9
File: feature_extractor.py  Project: tuxedocat/Nyanco
 def srl(self, v_idx=None):
     try:
         if not v_idx:
             v_idx = self.v_idx
         self.tmp_ARG0 = []
         self.tmp_ARG1 = []
         self.tmp_PRED = defaultdict(dict)
         ARGS = [(l[FeatureExtractor.col_srlrel], l[FeatureExtractor.col_suf], 
                  l[FeatureExtractor.col_pos], l[FeatureExtractor.col_netag]) for l in self.tags 
                 if l[FeatureExtractor.col_srl] != "_" and int(l[FeatureExtractor.col_srl]) - 1 == v_idx]
         if ARGS:
             srlf = {FeatureExtractor.gen_fn(["SRL", t[0], en.lemma(t[1])]):1 for t in ARGS}
             # srlp = {FeatureExtractor.gen_fn(["SRL", t[0], en.lemma(t[1])+"/"+t[2]]):1 for t in ARGS}
             srln = {FeatureExtractor.gen_fn(["SRL", t[0], t[3]]):1 for t in ARGS if not t[3]=="_"}
             self.features.update(srlf)
             # self.features.update(srlp)
             self.features.update(srln)
     except Exception as e:
         logging.debug(pformat(e))
Code example #10
def agg_n_grams(poems, template):
    logging.info('Starting aggregator: agg_n_grams')
    n_grams_by_poem = []
    for poem in poems:
        full_poem = ''
        for line in poem.poem:
            full_poem += line + ' '

        n_grams = []
        split_poem = get_tokenized_words(full_poem)
        split_poem = [lemma(word) for word in split_poem]
        for n in range(1, len(split_poem)):
            grams = ngrams(split_poem, n)
            n_grams.extend([gram for gram in grams if len(set(gram) - stop_words)])
        n_grams_by_poem.extend(n_grams)

    #Now filter by the ones that actually occur with some significant frequency
    min_num_occurrences = round(len(poems) * 0.1)
    counts = Counter(n_grams_by_poem)
    template.n_grams.extend([(' '.join(g for g in gram), count) for gram, count in counts.items() if count > min_num_occurrences + 1])
    template.n_grams = remove_redundant_substring_occurences(template.n_grams, min_num_occurrences)
    logging.info('Aggregator finished: agg_n_grams')
Code example #11
    def _collect_words(self, word_list):
        """
        Data cleaning.
        :param word_list:
        :return:
        """
        for w in word_list:

            # TODO: check for numbers
            if len(re.findall(is_num, w)) > 0:
                continue

            w_len = len(w)
            if w_len == 0:
                continue
            # Some keywords, e.g. C and R, must not be lowercased and are kept as-is,
            # because lemma() lowercases every word
            # if w_len == 1:
            #     rank_list[w] += 1
            # lowercase first so the word can be filtered out by stop_words
            if w_len > 1:
                w = w.lower()

            # drop stopwords
            if w in stopwords:
                continue

            w = lemma(w)
            # drop stopwords again (after lemmatization)
            if w in stopwords:
                continue

            else:
                # 4 stem
                # w = stemmer.stem(w)
                # w = lemmatizer.lemmatize(w)
                # w = singularize(w)
                self.rank_list[w] += 1
Code example #12
from pattern.text.en import conjugate, lemma, lexeme
print (lemma('gave'))
print (lexeme('gave'))
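A slightly extended version of the same snippet; the commented outputs are what pattern's rule-based verb tables typically produce and may vary by version:

from pattern.text.en import conjugate, lemma, lexeme

print(lemma('gave'))                    # 'give'
print(lexeme('gave'))                   # e.g. ['give', 'gives', 'giving', 'gave', 'given']
print(conjugate('give', tense='past'))  # 'gave'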
Code example #13
def partial_match(query, table_name):
    query = [lemma(x) for x in query]
    table_name = [lemma(x) for x in table_name]
    if query in table_name:
        return True
    return False
Code example #14
def alter_column0(datas):
    """
    Attach column * table
    :return: model_result_replace
    """
    zero_count = 0
    count = 0
    result = []
    for d in datas:
        if 'C(0)' in d['model_result']:
            pattern = regex.compile(r'C\(.*?\) T\(.*?\)')
            result_pattern = list(set(pattern.findall(d['model_result'])))
            ground_col_labels = []
            for pa in result_pattern:
                pa = pa.split(' ')
                if pa[0] != 'C(0)':
                    index = int(pa[1][2:-1])
                    ground_col_labels.append(index)

            ground_col_labels = list(set(ground_col_labels))
            question_arg_type = d['question_arg_type']
            question_arg = d['question_arg']
            table_names = [[lemma(x) for x in names.split(' ')]
                           for names in d['table_names']]
            origin_table_names = [[
                wordnet_lemmatizer.lemmatize(x.lower())
                for x in names.split(' ')
            ] for names in d['table_names']]
            count += 1
            easy_flag = False
            for q_ind, q in enumerate(d['question_arg']):
                q = [lemma(x) for x in q]
                q_str = " ".join(" ".join(x) for x in d['question_arg'])
                if 'how many' in q_str or 'number of' in q_str or 'count of' in q_str:
                    easy_flag = True
            if easy_flag:
                # check for the last one is a table word
                for q_ind, q in enumerate(d['question_arg']):
                    if (q_ind > 0 and q == ['many']
                            and d['question_arg'][q_ind - 1] == ['how']) or (
                                q_ind > 0 and q == ['of']
                                and d['question_arg'][q_ind - 1] == ['number']
                            ) or (q_ind > 0 and q == ['of'] and
                                  d['question_arg'][q_ind - 1] == ['count']):
                        re = multi_equal(question_arg_type, q_ind, ['table'],
                                         2)
                        if re is not False:
                            # This step works for the "number of [table]" example
                            table_result = table_names[
                                origin_table_names.index(question_arg[re])]
                            result.append(
                                (d['query'], d['question'], table_result, d))
                            break
                        else:
                            re = multi_option(question_arg, q_ind,
                                              d['table_names'], 2)
                            if re is not False:
                                table_result = re
                                result.append((d['query'], d['question'],
                                               table_result, d))
                                pass
                            else:
                                re = multi_equal(question_arg_type, q_ind,
                                                 ['table'],
                                                 len(question_arg_type))
                                if re is not False:
                                    # This step works for the "number of [table]" example
                                    table_result = table_names[
                                        origin_table_names.index(
                                            question_arg[re])]
                                    result.append((d['query'], d['question'],
                                                   table_result, d))
                                    break
                                pass
                                table_result = random_choice(
                                    question_arg=question_arg,
                                    question_arg_type=question_arg_type,
                                    names=table_names,
                                    ground_col_labels=ground_col_labels,
                                    q_ind=q_ind,
                                    N=2,
                                    origin_name=origin_table_names)
                                result.append((d['query'], d['question'],
                                               table_result, d))

                                zero_count += 1
                        break

            else:
                M_OP = False
                for q_ind, q in enumerate(d['question_arg']):
                    if M_OP is False and q in [['than'], ['least'], ['most'], ['msot'], ['fewest']] or \
                            question_arg_type[q_ind] == ['M_OP']:
                        M_OP = True
                        re = multi_equal(question_arg_type, q_ind, ['table'],
                                         3)
                        if re is not False:
                            # This step works for the "number of [table]" example
                            table_result = table_names[
                                origin_table_names.index(question_arg[re])]
                            result.append(
                                (d['query'], d['question'], table_result, d))
                            break
                        else:
                            re = multi_option(question_arg, q_ind,
                                              d['table_names'], 3)
                            if re is not False:
                                table_result = re
                                #                             print(table_result)
                                result.append((d['query'], d['question'],
                                               table_result, d))
                                pass
                            else:
                                #                             zero_count += 1
                                re = multi_equal(question_arg_type, q_ind,
                                                 ['table'],
                                                 len(question_arg_type))
                                if re is not False:
                                    # This step works for the "number of [table]" example
                                    table_result = table_names[
                                        origin_table_names.index(
                                            question_arg[re])]
                                    result.append((d['query'], d['question'],
                                                   table_result, d))
                                    break

                                table_result = random_choice(
                                    question_arg=question_arg,
                                    question_arg_type=question_arg_type,
                                    names=table_names,
                                    ground_col_labels=ground_col_labels,
                                    q_ind=q_ind,
                                    N=2,
                                    origin_name=origin_table_names)
                                result.append((d['query'], d['question'],
                                               table_result, d))

                                pass
                if M_OP is False:
                    table_result = random_choice(
                        question_arg=question_arg,
                        question_arg_type=question_arg_type,
                        names=table_names,
                        ground_col_labels=ground_col_labels,
                        q_ind=q_ind,
                        N=2,
                        origin_name=origin_table_names)
                    result.append((d['query'], d['question'], table_result, d))

    for re in result:
        table_names = [[lemma(x) for x in names.split(' ')]
                       for names in re[3]['table_names']]
        origin_table_names = [[x for x in names.split(' ')]
                              for names in re[3]['table_names']]
        if re[2] in table_names:
            re[3]['rule_count'] = table_names.index(re[2])
        else:
            re[3]['rule_count'] = origin_table_names.index(re[2])

    for data in datas:
        if 'rule_count' in data:
            str_replace = 'C(0) T(' + str(data['rule_count']) + ')'
            replace_result = regex.sub(r'C\(0\) T\(.\)', str_replace,
                                       data['model_result'])
            data['model_result_replace'] = replace_result
        else:
            data['model_result_replace'] = data['model_result']
Code example #15
File: utils.py  Project: terminalkitten/valuenet
def re_lemma(string):
    lema = lemma(string.lower())
    if len(lema) > 0:
        return lema
    else:
        return string.lower()
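A quick sanity check of re_lemma (illustrative input, assuming pattern's lemma() is imported in the same module): the helper uses the lemma when it is non-empty and otherwise falls back to the lowercased original:

print(re_lemma('Gave'))  # 'give'; if lemma() returned an empty string,
                         # the lowercased input 'gave' would come back instead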
Code example #16
def replace(old_word, candidates, phrases):
    new_phrases = []
    #Find the word among the phrases, replace with candidate with same pos
    for phrase in phrases:
        if 'noun' in phrase.__dict__.keys():
            if phrase.noun == lemma(old_word):
                replacement = get_rhyme_word(old_word, candidates, 'N')
                if not replacement:
                    phrase.post_modifiers.append(phrase_spec.ADJ(get_rhyme_mod(old_word, candidates, 'A', 'N')))
                else:
                    phrase = phrase_spec.NP(replacement)

        if 'verb' in phrase.__dict__.keys():
            if phrase.verb == lemma(old_word):
                replacement = get_rhyme_word(old_word, candidates, 'V')
                if not replacement:
                    phrase.post_modifiers.append(phrase_spec.ADV(get_rhyme_mod(old_word, candidates, 'AVP', 'V')))
                else:
                    phrase = phrase_spec.VP(replacement)

        if 'np' in phrase.__dict__.keys():
            if phrase.np.noun == lemma(old_word):
                replacement = get_rhyme_word(old_word, candidates, 'N')
                if not replacement:
                    phrase.np.post_modifiers.append(
                        phrase_spec.ADJ(get_rhyme_mod(old_word, candidates, 'A', 'N')))
                else:
                    phrase.np = phrase_spec.NP(replacement)

        for pre_modifier in phrase.pre_modifiers:
            if 'adjective' in pre_modifier.__dict__.keys():
                if pre_modifier.adjective == lemma(old_word):
                    option_nodes = [get_node(candidate['word'], 'a') for candidate in candidates]
                    replacement_nodes = list(closest_matching([get_node(old_word, 'a')], option_nodes))
                    if replacement_nodes:
                        replacement = random.choice(replacement_nodes).id.split()[0]
                    else:
                        replacement = random.choice(candidates)['word']
                    new_pre_modifier = phrase_spec.ADJ(replacement)
                    pre_modifier_index = phrase.pre_modifiers.index(pre_modifier)
                    phrase.pre_modifiers[pre_modifier_index] = new_pre_modifier
            if 'adverb' in pre_modifier.__dict__.keys():
                if pre_modifier.adverb == lemma(old_word):
                    option_nodes = [get_node(candidate['word'], 'adv') for candidate in candidates]
                    replacement_nodes = list(closest_matching([get_node(old_word, 'adv')], option_nodes))
                    if replacement_nodes:
                        replacement = random.choice(replacement_nodes).id.split()[0]
                    else:
                        replacement = random.choice(candidates)['word']
                    new_pre_modifier = phrase_spec.ADJ(replacement)
                    pre_modifier_index = phrase.pre_modifiers.index(pre_modifier)
                    phrase.pre_modifiers[pre_modifier_index] = new_pre_modifier

        for modifier in phrase.modifiers:
            if 'adjective' in modifier.__dict__.keys():
                if modifier.adjective == lemma(old_word):
                    option_nodes = [get_node(candidate['word'], 'a') for candidate in candidates]
                    replacement_nodes = list(closest_matching([get_node(old_word, 'a')], option_nodes))
                    if replacement_nodes:
                        replacement = random.choice(replacement_nodes).id.split()[0]
                    else:
                        replacement = random.choice(candidates)['word']
                    new_modifier = phrase_spec.ADJ(replacement)
                    modifier_index = phrase.modifiers.index(modifier)
                    phrase.modifiers[modifier_index] = new_modifier
            if 'adverb' in modifier.__dict__.keys():
                if modifier.adverb == lemma(old_word):
                    option_nodes = [get_node(candidate['word'], 'adv') for candidate in candidates]
                    replacement_nodes = list(closest_matching([get_node(old_word, 'adv')], option_nodes))
                    if replacement_nodes:
                        replacement = random.choice(replacement_nodes).id.split()[0]
                    else:
                        replacement = random.choice(candidates)['word']
                    new_modifier = phrase_spec.ADJ(replacement)
                    modifier_index = phrase.modifiers.index(modifier)
                    phrase.modifiers[modifier_index] = new_modifier

        for post_modifier in phrase.post_modifiers:
            if 'adjective' in post_modifier.__dict__.keys():
                if post_modifier.adjective == lemma(old_word):
                    option_nodes = [get_node(candidate['word'], 'a') for candidate in candidates]
                    replacement_nodes = list(closest_matching([get_node(old_word, 'a')], option_nodes))
                    if replacement_nodes:
                        replacement = random.choice(replacement_nodes).id.split()[0]
                    else:
                        replacement = random.choice(candidates)['word']
                    new_post_modifier = phrase_spec.ADJ(replacement)
                    post_modifier_index = phrase.post_modifiers.index(post_modifier)
                    phrase.post_modifiers[post_modifier_index] = new_post_modifier
            if 'adverb' in post_modifier.__dict__.keys():
                if post_modifier.adverb == lemma(old_word):
                    option_nodes = [get_node(candidate['word'], 'adv') for candidate in candidates]
                    replacement_nodes = list(closest_matching([get_node(old_word, 'adv')], option_nodes))
                    if replacement_nodes:
                        replacement = random.choice(replacement_nodes).id.split()[0]
                    else:
                        replacement = random.choice(candidates)['word']
                    new_post_modifier = phrase_spec.ADJ(replacement)
                    post_modifier_index = phrase.post_modifiers.index(post_modifier)
                    phrase.post_modifiers[post_modifier_index] = new_post_modifier

        new_phrases.append(phrase)

    if not new_phrases:
        new_phrases = phrases

    return new_phrases
Code example #17
def collate_text(batch):
    batch_posts, batch_response = list(zip(*batch))

    max_post_len_list = [
        max([len(posts[i]) for posts in batch_posts]) + 2 for i in range(4)
    ]
    max_response_len = max([len(response) for response in batch_response]) + 2

    post_1, post_2, post_3, post_4 = [], [], [], []
    post_length_1, post_length_2, post_length_3, post_length_4 = [], [], [], []
    response = []
    response_length = []

    def padding(sent, length):
        """ Add sos and eos tokens, then pad sentence to length"""
        return ['_SOS'] + sent + ['_EOS'] + (['_PAD'] *
                                             (length - len(sent) - 2))

    for posts in batch_posts:
        post_1.append(padding(posts[0], max_post_len_list[0]))
        post_2.append(padding(posts[1], max_post_len_list[1]))
        post_3.append(padding(posts[2], max_post_len_list[2]))
        post_4.append(padding(posts[3], max_post_len_list[3]))

        post_1[-1] = list(map(transform, post_1[-1]))
        post_2[-1] = list(map(transform, post_2[-1]))
        post_3[-1] = list(map(transform, post_3[-1]))
        post_4[-1] = list(map(transform, post_4[-1]))

        post_length_1.append(len(posts[0]) + 2)
        post_length_2.append(len(posts[1]) + 2)
        post_length_3.append(len(posts[2]) + 2)
        post_length_4.append(len(posts[3]) + 2)

    for i in range(len(batch_response)):
        sample_response = batch_response[i]
        response.append(padding(sample_response, max_response_len))
        response[-1] = list(map(transform, response[-1]))
        response_length.append(len(sample_response) + 2)

    entity = [[], [], [], []]
    for posts in batch_posts:
        for i in range(4):
            entity[i].append([])
            for j in range(len(posts[i])):
                word = posts[i][j]
                try:
                    lemmatized = lemma(word)
                except UnicodeEncodeError:
                    lemmatized = word
                if lemmatized in relation:
                    entity[i][-1].append([
                        list(map(transform, triple))
                        for triple in relation[lemmatized]
                    ])
                else:
                    entity[i][-1].append([[4, 4, 4]])  # naf_h, naf_r, naf_t

    # entity[i][j][k][l] : lth triple with kth word in ith post of jth sample as head entity

    max_triple_len = [0, 0, 0, 0]
    # entity_length_list = []

    for i in range(4):
        for j in range(len(entity[i])):
            for k in range(len(entity[i][j])):
                if len(entity[i][j][k]) > max_triple_len[i]:
                    max_triple_len[i] = len(entity[i][j][k])

    entity_list = []
    entity_mask_list = []
    entity_length_list = []

    for i in range(4):
        entity_list.append(
            np.array(list(zip_longest(*entity[i], fillvalue=[[4, 4, 4]]))).T)

        entity_list[i] = np.array([
            np.pad(triples,
                   pad_width=((0, max_triple_len[i] - len(triples)), (0, 0)),
                   mode='constant',
                   constant_values=4) for sample in entity_list[i]
            for triples in sample
        ])

        entity_list[i] = entity_list[i].reshape(
            (len(batch), -1, max_triple_len[i], 3))
        pre_post_fix = np.full((len(batch), 1, max_triple_len[i], 3), 4)
        entity_list[i] = np.concatenate(
            (pre_post_fix, entity_list[i], pre_post_fix), axis=1)

    for i in range(4):
        entity_list[i] = torch.tensor(entity_list[i])
        entity_mask_list.append(entity_list[i][:, :, :, 0] == 4)
        entity_length_list.append(
            torch.sum((entity_list[i][:, :, :, 0] != 4), dim=2))

    batched_data = {
        'post_1': torch.tensor(post_1),  # (batch_size, max_post_1_len)
        'post_2': torch.tensor(post_2),
        'post_3': torch.tensor(post_3),
        'post_4': torch.tensor(post_4),
        'post_length_1': torch.tensor(post_length_1),  # (batch_size,)
        'post_length_2': torch.tensor(post_length_2),
        'post_length_3': torch.tensor(post_length_3),
        'post_length_4': torch.tensor(post_length_4),
        'response': torch.tensor(response),  # (batch_size, max_response_len)
        'response_length': torch.tensor(response_length),  # (batch_size,)
        'entity_1':
        entity_list[0],  # (batch_size, max_post_1_len, max_triple_num, 3)
        'entity_2': entity_list[1],
        'entity_3': entity_list[2],
        'entity_4': entity_list[3],
        'entity_mask_1':
        entity_mask_list[0],  # (batch_size, max_post_1_len, max_triple_num)
        'entity_mask_2': entity_mask_list[1],
        'entity_mask_3': entity_mask_list[2],
        'entity_mask_4': entity_mask_list[3],
        'entity_length_1':
        entity_length_list[0],  # (batch_size, max_post_1_len)
        'entity_length_2': entity_length_list[1],
        'entity_length_3': entity_length_list[2],
        'entity_length_4': entity_length_list[3]
    }

    return batched_data
Code example #18
        def write_hypo(parent, count, list_of_neighbors):

            return_dict = {}

            for index in range(0, len(list_of_neighbors)):
                s = wordnet.synsets(list_of_neighbors[index])
                if len(s) > 0:
                    s = s[0]

                    synonyms = s.synonyms
                    hypernyms = s.hypernyms()
                    hyponyms = s.hyponyms()
                    holonyms = s.holonyms()
                    meronyms = s.meronyms()
                    singulars = [singularize(list_of_neighbors[index])]
                    plurals = [pluralize(list_of_neighbors[index])]
                    comparatives = [comparative(list_of_neighbors[index])]
                    superlatives = [superlative(list_of_neighbors[index])]
                    lemmas = [lemma(list_of_neighbors[index])]
                    lexemes = [lexeme(list_of_neighbors[index])]
                    tensess = [tenses(list_of_neighbors[index])]
                    suggests = [suggest(list_of_neighbors[index])]

                    neighbors_with_link_string = None

                    if parent in synonyms:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[SYNO]"
                    elif parent in hypernyms:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[HYPER]"
                    elif parent in hyponyms:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[HYPO]"
                    elif parent in holonyms:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[HOLO]"
                    elif parent in meronyms:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[MERO]"
                    elif parent in singulars:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[PLURAL]"
                    elif parent in plurals:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[SINGULAR]"
                    elif parent in comparatives:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[COMPA]"
                    elif parent in superlatives:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[SUPERLA]"
                    elif parent in lemmas:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[LEMMA]"
                    elif parent in lexemes:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[LEXEME]"
                    elif parent in tensess:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[TENSE]"
                    elif parent in suggests:
                        neighbors_with_link_string = str(
                            list_of_neighbors[index]) + "[MISPELL]"

                    if neighbors_with_link_string:
                        try:
                            return_dict[word][1].append(
                                neighbors_with_link_string)
                        except:
                            return_dict[word] = (count,
                                                 [neighbors_with_link_string])
            return return_dict
Code example #19
def extend_phrase(phrases, target_num_syllables, num_syllables):
    logging.info('Extending phrase')
    used = []
    #While less than:
    #Add adjectives and adverbs as modifiers with max missing number of syllables
    while num_syllables < target_num_syllables:
        added_specifier = False
        if target_num_syllables == num_syllables + 1:
            for phrase in phrases:
                if 'noun' in phrase.__dict__.keys():
                    if phrase.specifier is None:
                        phrase.specifier = 'the'
                        added_specifier = True
                        break
                elif 'np' in phrase.__dict__.keys():
                    if phrase.np.specifier is None:
                        phrase.np.specifier = 'the'
                        added_specifier = True
                        break
            if added_specifier:
                break
        if added_specifier:
            break

        changeable_phrases = []
        for phrase in phrases:
            try:
                if 'noun' in phrase.__dict__.keys() and phrase.noun[0].isupper():
                    continue
                else:
                    changeable_phrases.append(phrase)
            except IndexError:
                changeable_phrases.append(phrase)

        phrase_to_change = phrases.index(random.choice(changeable_phrases))
        pos = 'A'
        target_pos = 'N'
        if 'verb' in phrases[phrase_to_change].__dict__.keys():
            target_word = phrases[phrase_to_change].verb
            target_pos = 'V'
            pos = 'AVP'
        elif 'np' in phrases[phrase_to_change].__dict__.keys():
            target_word = phrases[phrase_to_change].np.noun
        else:
            target_word = phrases[phrase_to_change].noun

        #Need to check that it is <= the required number of syllables
        word = ''
        added_syllables = 0
        tries = 10
        while tries:
            try:
                word = get_property(lemma(target_word.split()[-1]), target_pos, used)
            except IndexError:
                word = get_random_word(pos)

            used.append(word)
            added_syllables = count_syllables([word])[0]
            if added_syllables <= (target_num_syllables - num_syllables):
                break
            tries -= 1

        if pos == 'A':
            modifier_phrase = phrase_spec.ADJ(word)
        else:
            modifier_phrase = phrase_spec.ADV(word)

        phrases[phrase_to_change].pre_modifiers.append(modifier_phrase)

        num_syllables += added_syllables

    return phrases