def forecast_token(text, masked_index, tokenizer, model):
    tokenized_text = ['[CLS]']
    doc = nlp(text)
    tokenized_text.extend([token.text for token in doc])
    tokenized_text.append('[SEP]')

    synonyms_ = get_candidate_tokens(tokenized_text[masked_index])
    synonyms_ = list(set(synonyms_))

    masked_token = tokenized_text[masked_index]
    token_polarity = int(Word(masked_token, language="en").polarity)

    # Keep only candidates that share the masked token's polarity.
    synonyms = []
    for elem in synonyms_:
        if int(Word(elem, language="en").polarity) == token_polarity:
            synonyms.append(elem)

    # Mask the token that we will try to predict back with `BertForMaskedLM`.
    tokenized_text[masked_index] = '[MASK]'

    # Convert tokens to vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Convert inputs to PyTorch tensors.
    tokens_tensor = torch.tensor([indexed_tokens])

    # Predict all tokens.
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    token_idxs = [tokenizer.convert_tokens_to_ids([word])[0] for word in synonyms]
    preds = np.array([predictions[0, masked_index, idx] for idx in token_idxs])
    sort_top = preds.argsort()
    # predicted_index = token_idxs[sort_top[-1]]
    # candidate_tokens = [synonyms[sort_top[-1]], synonyms[sort_top[-2]]]

    # Collect every candidate whose score is (nearly) tied with the best one.
    candidate_tokens = []
    for nn in np.arange(len(preds)):
        if abs(preds[nn] - preds[sort_top[-1]]) < 0.0001:
            candidate_tokens.append(synonyms[nn])

    if masked_token in candidate_tokens:
        # If the masked token is among the top-scoring candidates, consider it correct and keep it.
        predicted_token, softmax_prob = masked_token, 100
    else:
        predicted_token, softmax_prob = synonyms[sort_top[-1]], preds[sort_top[-1]]

    # Do not change the token if the prediction only differs in letter case.
    if masked_token.lower() == predicted_token.lower():
        predicted_token = masked_token

    return predicted_token, softmax_prob
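# Hypothetical setup sketch for forecast_token: the function relies on a spaCy
# pipeline `nlp`, a candidate generator `get_candidate_tokens`, torch/numpy and
# polyglot's Word, none of which are shown above. Everything below is an assumed
# wiring for illustration, not the original configuration.
import numpy as np
import torch
import spacy
from polyglot.text import Word
from transformers import BertTokenizer, BertForMaskedLM

nlp = spacy.load("en_core_web_sm")              # assumes the spaCy model is installed
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
model.eval()

def get_candidate_tokens(token):
    # Placeholder candidate generator; the real implementation (e.g. a synonym
    # lookup) is not part of this snippet.
    return [token]

if __name__ == "__main__":
    # ['[CLS]', 'The', 'movie', 'was', 'great', '[SEP]'] -> index 4 is "great"
    print(forecast_token("The movie was great", 4, tokenizer, model))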
def polyglot_stem():
    print("\nDerivational Morphemes using polyglot library")
    for w in words_derv:
        w = Word(w, language="en")
        print("{:<20}{}".format(w, w.morphemes))
    print("\nInflectional Morphemes using polyglot library")
    for w in word_infle:
        w = Word(w, language="en")
        print("{:<20}{}".format(w, w.morphemes))
    print("\nSome Morphemes examples using polyglot library")
    for w in word_infle:
        w = Word(w, language="en")
        print("{:<20}{}".format(w, w.morphemes))
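# Hypothetical driver for polyglot_stem: the function reads the module-level
# lists words_derv and word_infle, which are not shown here, so the values
# below are assumed examples. Requires polyglot's English morphology model.
from polyglot.text import Word

words_derv = ["unhappiness", "carelessness"]   # assumed derivational examples
word_infle = ["running", "books"]              # assumed inflectional examples

if __name__ == "__main__":
    polyglot_stem()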
def convertToPolyglotMorf(sentences, save=False):
    # List of sentences (str) to a list of morphemes (str)
    total_review_morf_text_list = []
    i = 1
    morfed_sentences = []
    print(len(sentences))
    for sentence in sentences:
        print(i)
        tokenized_sentence = ucto_tokenize(sentence)
        morfed_sentence = []
        for w in tokenized_sentence:
            w = str(w)
            w = Word(w, language="nl")
            # print("{:<20}{}".format(w, w.morphemes))
            morfed_sentence += w.morphemes
        # print(review_morf_list)
        morfed_sentences += morfed_sentence
        i += 1
    morfed_sentences_text = '*%'.join(morfed_sentences)
    if save is True:
        with open("TrainFiles/convertedPolyglotMorfText.txt", "w") as text_file:
            text_file.write(morfed_sentences_text)
    return morfed_sentences
def get_sems(word, lang):
    print(word)
    w = Word(word.lower(), language=CODES[lang.lower()])
    try:
        res = w.neighbors
    except Exception as e:
        logger.warning(e)
        return None
    return res
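# Hypothetical usage sketch for get_sems: CODES and logger are module-level
# objects not shown here, so the values below are assumptions. Word.neighbors
# needs the polyglot embeddings for the target language to be downloaded.
import logging
from polyglot.text import Word

logger = logging.getLogger(__name__)
CODES = {"english": "en", "german": "de"}      # assumed name-to-code mapping

if __name__ == "__main__":
    print(get_sems("green", "English"))        # nearest-neighbour words, or None on error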
def test_polyglot1(self):
    import polyglot
    from polyglot.text import Text, Word

    text = Text("Bonjour, Mesdames.")
    print("Language Detected: Code={}, Name={}\n".format(text.language.code, text.language.name))

    text = Text("第一条 机动车第三者责任保险合同(以下简称本保险合同)由保险条款、投保单、保险单、批单和特别约定共同组成。 "
                "本保险合同争议处理适用中华人民共和国法律。")
    # print(text.entities)
    """
    print("{:<16}{}".format("Word", "POS Tag") + "\n" + "-" * 30)
    for word, tag in text.pos_tags:
        print(u"{:<16}{:>2}".format(word, tag))
    """

    word = Word("Obama", language="en")
    word = Word("中华人民共和国", language="zh")
    print("Neighbors (Synonyms) of {}".format(word) + "\n" + "-" * 30)
    for w in word.neighbors:
        print("{:<16}".format(w))
    print("\n\nThe first 10 dimensions out of the {} dimensions\n".format(word.vector.shape[0]))
    print(word.vector[:10])
def explore_parse_tree(tokens, tree_nodes):
    # for a word token, obtain morphemes
    if len(tokens) == 1:
        w = Word(tokens.text, language='en')
        for morph in w.morphemes:
            if morph == tokens.text or morph == tokens.lemma_:
                continue
            tree_nodes.append((morph, 'morpheme', 1))
        # add the word itself
        tree_nodes.append((tokens.text, tokens._.labels, len(tokens)))
    if tokens._.labels == () or len(tokens) == 0:
        return 0
    tree_nodes.append((tokens.text, tokens._.labels, len(tokens)))
    # recurse into constituency-parsed child nodes
    for child in tokens._.children:
        explore_parse_tree(child, tree_nodes)
def embeddings():
    from polyglot.text import Word

    data = dict(default_data)
    data['message'] = ("Neighbours (Embeddings) - Find neighbors of word API - "
                       "Parameters: 'word', 'lang' language (default: en)")

    params = {}
    params['word'] = request.args.get('word')
    params['lang'] = request.args.get('lang')

    if not params:
        data['error'] = 'Missing parameters'
        return jsonify(data)
    if not params['word']:
        data['error'] = '[word] parameter not found'
        return jsonify(data)
    if not params['lang']:
        # data['error'] = '[lang] parameter not found'
        # return jsonify(data)
        params['lang'] = 'en'

    data['neighbours'] = {}
    try:
        word = Word(params['word'], language=params['lang'])
    except KeyError:
        data['error'] = 'ERROR: word not found'
        return jsonify(data)

    if not word:
        data['error'] = 'word not found'
        return jsonify(data)

    data['neighbours'] = word.neighbors
    return jsonify(data)
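# Hypothetical wiring sketch: the Flask app, the route registration and
# default_data are not shown above, so the objects below are assumptions, not
# the original application setup.
from flask import Flask, request, jsonify

app = Flask(__name__)
default_data = {}                              # assumed boilerplate response fields
app.add_url_rule("/embeddings", "embeddings", embeddings)

# Example request once the app is running:
#   GET /embeddings?word=Obama&lang=en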
def _morf(self, originalWord):
    words = []
    w = Word(originalWord, language="he")
    morp = w.morphemes
    if len(morp) == 1:
        words.append(originalWord)
        return words
    else:
        notIn = []
        for w in morp:
            if w not in morp_clean:
                notIn.append(w)
        if len(notIn) > 1:
            words.append(originalWord)
            return words
        hasInv = False
        for w2 in notIn:
            if len(w2) == 1:
                hasInv = True
        if hasInv:
            words.append(originalWord)
            return words
        else:
            s1 = set(morp)
            if len(s1) == len(morp):
                words.extend(morp)
            else:
                words.append(originalWord)
    return words
def _morf(text):
    words = []
    w = Word(text, language="he")
    morp = w.morphemes  # x
    if len(morp) == 1:
        words.append(text)
        return words
    else:
        notIn = []
        for w in morp:
            if w not in x:
                notIn.append(w)
        if len(notIn) > 1:
            words.append(text)
            return words
        hasInv = False
        for w2 in notIn:
            if len(w2) == 1:
                hasInv = True
        if hasInv:
            words.append(text)
            return words
        else:
            s1 = set(morp)
            if len(s1) == len(morp):
                words.extend(morp)
            else:
                words.append(text)
    return words
from polyglot.text import Word
import sys

LANGUAGE = sys.argv[1]

for line in sys.stdin:
    l = []
    for w in line.split():
        m = Word(w, language=LANGUAGE).morphemes
        if len(m) == 1:
            l.append(w)
        else:
            l.append("@@ ".join(m))
    print(" ".join(l))
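# Hypothetical invocation: the script reads whitespace-tokenised text from stdin
# and writes each word either unchanged or split into "@@ "-joined morphemes.
# Assuming the file is saved as segment_morph.py (the name is not given in the source):
#
#   cat corpus.txt | python segment_morph.py en > corpus.morph.txt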
# full_train_text = ' '.join(FullTrainCorpusList)

with open('PosTestData.data', 'rb') as filehandle:
    # read the data as a binary data stream
    PosTestCorpusList = pickle.load(filehandle)

ShorterCorpusList = PosTestCorpusList[:100]
short_text = ' '.join(ShorterCorpusList)
pos_test_text = ' '.join(PosTestCorpusList)

print(downloader.supported_languages_table("morph2"))

words = ["preprocessing", "processor", "invaluable", "thankful", "crossed"]
for w in words:
    w = Word(w, language="en")
    print("{:<20}{}".format(w, w.morphemes))

# train_data = list(pos_test_text)
#
# io = morfessor.MorfessorIO()
#
# #train_data = list(io.read_corpus_file('training_data'))
#
# model = morfessor.BaselineModel()
#
# #model.load_data(train_data, count_modifier=lambda x: 1)
# #def log_func(x):
# #    return int(round(math.log(x + 1, 2)))
# #model_logtokens.load_data(train_data, count_modifier=log_func)
# model.load_data(train_data)
def read_extra_features(split_dir, normal_wiki_ngram_2=None, normal_wiki_ngram_3=None,
                        simple_wiki_ngram_2=None, simple_wiki_ngram_3=None,
                        cbt_corpus_ngram_2=None, cbt_corpus_ngram_3=None,
                        normal_wiki=None, simple_wiki=None, lexicon_dir=None,
                        brown_dict=None, lang_8_corpus=None, tatoeba=None,
                        cbt_corpus=None, nlp=None):
    if 'tsv' in split_dir:
        data = pd.read_csv(split_dir, sep='\t', quoting=csv.QUOTE_NONE)
    elif 'xlsx' in split_dir:
        data = pd.read_excel(split_dir)
        data.rename(columns={'subcorpus': 'corpus'}, inplace=True)
    data.token.fillna('null', inplace=True)

    print('Generating dependencies corpus')
    data['pos_label'] = data.apply(lambda x: get_meta(x.sentence, x.token, nlp, 'pos'), axis=1)
    data['sentence_pre'] = data.apply(lambda x: get_meta(x.sentence, x.token, nlp, 'text'), axis=1)
    # data['entities_sent'] = data.apply(lambda x: get_meta(x.sentence, x.token, nlp, 'ent'), axis=1)
    data['dep_target'] = data.apply(lambda x: get_meta(x.sentence, x.token, nlp, 'dep'), axis=1)

    extra_features = []
    len_lang_8_words = len(lang_8_corpus.split(' '))
    len_tatoeba = len(tatoeba.split(' '))
    len_cbt = len(cbt_corpus.split(' '))
    len_normal_wiki = len(normal_wiki.split(' '))
    len_simple_wiki = len(simple_wiki.split(' '))

    print('Generating auxiliary complexity')
    extra_lexicon = pd.read_csv(lexicon_dir, sep='\t', names=['token', 'complex_aux'])
    extra_lexicon['token_l'] = extra_lexicon['token'].str.lower()
    data['token_l'] = data['token'].str.lower()
    data_merged = pd.merge(data, extra_lexicon[['token_l', 'complex_aux']], on='token_l', how='left')

    def find_position(row):
        try:
            if ' ' in row.token:
                ix = row.sentence_pre.index(row.token.split(' '))
                return [ix, ix + 1]
            else:
                return [row.sentence_pre.index(row.token)]
        except:
            if ' ' in row.token:
                token_find = row.token.split(' ')[0]
            else:
                token_find = row.token
            for ix, w in enumerate(row.sentence_pre):
                if token_find in w:
                    if ' ' in row.token:
                        return [ix, ix + 1]
                    else:
                        return [ix]
            if ' ' in row.token:
                token_find = row.token_l.split(' ')[0]
            else:
                token_find = row.token_l
            for ix, w in enumerate(row.sentence_pre):
                if token_find in w:
                    if ' ' in row.token_l:
                        return [ix, ix + 1]
                    else:
                        return [ix]

    print('Counting ...')
    data_merged['position'] = data_merged.parallel_apply(lambda x: find_position(x), axis=1)
    data_merged['pos_tag'] = data_merged.apply(lambda x: x.pos_label[x.position][0], axis=1)
    # data_merged['entity'] = data_merged.apply(lambda x: x.entities_sent[x.position], axis=1)
    data_merged['len_sentence'] = data_merged.parallel_apply(lambda x: len(x.sentence), axis=1)
    data_merged['len_token'] = data_merged.parallel_apply(lambda x: len(x.token), axis=1)
    data_merged['count_senses'] = data_merged.apply(
        lambda x: sum([len(wn.synsets(w)) for w in x.token.split(' ')]), axis=1)
    data_merged['count_tags'] = data_merged.parallel_apply(
        lambda x: sum([len(brown_dict[w.lower()]) for w in x.token.split(' ')]), axis=1)
    data_merged['count_syllables'] = data_merged.parallel_apply(
        lambda x: syllables.estimate(x.token), axis=1)
    data_merged['count_morphemes'] = data_merged.parallel_apply(
        lambda x: sum([len(Word(w, language='en').morphemes) for w in x.token.split(' ')]), axis=1)

    print('Counting ...')
    data_merged['count_after'] = data_merged.parallel_apply(
        lambda x: len(x.sentence.partition(x.token)[2].split(' ')), axis=1)
    data_merged['count_before'] = data_merged.parallel_apply(
        lambda x: len(x.sentence.partition(x.token)[0].split(' ')), axis=1)

    def get_features_from_corpus(row):
        count_lang_8 = lang_8_corpus.count(row.token)
        count_tatoeba = tatoeba.count(row.token)
        count_cbt = cbt_corpus.count(row.token)
        count_normal_wiki = normal_wiki.count(row.token)
        count_simple_wiki = simple_wiki.count(row.token)
        return pd.Series([
            count_lang_8, count_lang_8 / len_lang_8_words,
            count_tatoeba, count_tatoeba / len_tatoeba,
            count_cbt, count_cbt / len_cbt,
            count_normal_wiki, count_normal_wiki / len_normal_wiki,
            count_simple_wiki, count_simple_wiki / len_simple_wiki
        ])

    print('Generating features from corpus ...')
    data_merged[[
        'count_lang_8', 'freq_lang_8', 'count_tatoeba', 'freq_tatoeba',
        'count_cbt', 'freq_cbt', 'count_normal_wiki', 'freq_normal_wiki',
        'count_single_wiki', 'freq_single_wiki'
    ]] = data_merged.parallel_apply(lambda x: get_features_from_corpus(x), axis=1)

    data_merged['count_dep'] = data_merged.parallel_apply(
        lambda x: Counter(x.dep_target)[x.token], axis=1)
    data_merged['count_words'] = data_merged.parallel_apply(
        lambda x: x.token.count(' '), axis=1)

    def get_tags_features(row):
        lim_aux = row.position[0] - 8
        if len(row.position) > 1:
            lim_sup = row.position[1]
        else:
            lim_sup = row.position[0]
        sentence_pre = ' '.join(row.sentence_pre[(0 if lim_aux < 0 else lim_aux):(lim_sup + 7)])
        tags_cut_c = Counter(row.pos_label[(0 if lim_aux < 0 else lim_aux):(lim_sup + 5)])
        count_nouns = tags_cut_c['NOUN'] if 'NOUN' in tags_cut_c else 0
        count_verbs = tags_cut_c['VERB'] if 'VERB' in tags_cut_c else 0
        ratio = (count_nouns / count_verbs) if count_nouns != 0 and count_verbs != 0 else 0
        return pd.Series([
            ratio,
            tags_cut_c['PROPN'] if 'PROPN' in tags_cut_c else 0,
            count_nouns,
            tags_cut_c['ADV'] if 'ADV' in tags_cut_c else 0,
            count_verbs,
            tags_cut_c['PART'] if 'PART' in tags_cut_c else 0
        ])

    print('Generating tags features ...')
    data_merged[[
        'ratio', 'count_propn', 'count_noun', 'count_adv', 'count_verb', 'count_part'
    ]] = data_merged.parallel_apply(lambda x: get_tags_features(x), axis=1)

    def get_ngram_features(row):
        if len(row.position) > 1:
            pos_after = row.position[1]
        else:
            pos_after = row.position[0]
        pos_before = row.position[0]
        if pos_after + 1 < len(row.sentence_pre):
            tuple_after = (row.sentence_pre[pos_after], row.sentence_pre[pos_after + 1])
        else:
            tuple_after = (row.sentence_pre[pos_after], '.')
        if pos_before - 1 >= 0:
            tuple_before = (row.sentence_pre[pos_before - 1], row.sentence_pre[pos_before])
        else:
            tuple_before = ('.', row.sentence_pre[pos_before])
        aux_features = []
        aux_features.append(normal_wiki_ngram_2[tuple_after])
        aux_features.append(simple_wiki_ngram_2[tuple_after])
        aux_features.append(cbt_corpus_ngram_2[tuple_after])
        aux_features.append(normal_wiki_ngram_2[tuple_before])
        aux_features.append(simple_wiki_ngram_2[tuple_before])
        aux_features.append(cbt_corpus_ngram_2[tuple_before])
        aux_features.append(normal_wiki_ngram_3[tuple_after])
        aux_features.append(simple_wiki_ngram_3[tuple_after])
        aux_features.append(cbt_corpus_ngram_3[tuple_after])
        aux_features.append(normal_wiki_ngram_3[tuple_before])
        aux_features.append(simple_wiki_ngram_3[tuple_before])
        aux_features.append(cbt_corpus_ngram_3[tuple_before])
        return pd.Series(aux_features)

    print('Generating ngram features ...')
    data_merged[[
        'count_ngram_2_simple_wiki_after', 'count_ngram_2_normal_wiki_after',
        'count_ngram_2_cbt_corpus_after', 'count_ngram_2_simple_wiki_before',
        'count_ngram_2_normal_wiki_before', 'count_ngram_2_cbt_corpus_before',
        'count_ngram_3_simple_wiki_after', 'count_ngram_3_normal_wiki_after',
        'count_ngram_3_cbt_corpus_after', 'count_ngram_3_simple_wiki_before',
        'count_ngram_3_normal_wiki_before', 'count_ngram_3_cbt_corpus_before'
    ]] = data_merged.apply(lambda x: get_ngram_features(x), axis=1)

    return data_merged.drop(['sentence', 'token', 'token_l'], axis=1)
print("amount of adjectives") print(len(a)) print("most common verbs") b = Counter(verbs) print(b.most_common(20)) print("amount of verbs") print(len(b)) #init lists for adjective types: positive_adjectives = [] neutral_adjectives = [] negative_adjectives = [] #find out adjective sentiment and append to lists accordingly for adjective in adjectives: w = Word(adjective, language="fi") if (w.polarity == -1): negative_adjectives.append(w) elif (w.polarity == 1): positive_adjectives.append(w) else: neutral_adjectives.append(w) print("amount of positive adjectives") print(len(positive_adjectives)) print("amount of negative adjectives") print(len(negative_adjectives)) print("amount of neutral adjectives") print(len(neutral_adjectives)) pos_a = Counter(positive_adjectives) neg_a = Counter(negative_adjectives)
def getIndonesianMorphs(word):
    w = Word(word, language="id")
    return w.morphemes
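# Hypothetical usage sketch: assumes Word is imported from polyglot.text and the
# Indonesian morphology model (morph2.id) has been downloaded, e.g. with
# polyglot's downloader.
from polyglot.text import Word

if __name__ == "__main__":
    print(getIndonesianMorphs("memakan"))      # list of morpheme strings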
# ## Named Entity Recognition
text = Text(u"In Großbritannien war Gandhi mit dem westlichen Lebensstil vertraut geworden")
print(text.entities)

# ## Polarity
print("{:<16}{}".format("Word", "Polarity") + "\n" + "-" * 30)
for w in zen.words[:6]:
    print("{:<16}{:>2}".format(w, w.polarity))

# ## Embeddings
word = Word("Obama", language="en")
print("Neighbors (Synonyms) of {}".format(word) + "\n" + "-" * 30)
for w in word.neighbors:
    print("{:<16}".format(w))
print("\n\nThe first 10 dimensions out of the {} dimensions\n".format(word.vector.shape[0]))
print(word.vector[:10])

# ## Morphology
word = Text("Preprocessing is an essential step.").words[0]
print(word.morphemes)

# ## Transliteration
from polyglot.text import Word, Text

words = "həmişə bütün hüquq normalarda hər üç element olmur".split(" ")
for w in words:
    w = Word(w, language="az")
    print("{:<20}{}".format(w, w.morphemes))

"""
həmişə ['həmişə']
bütün ['bütün']
hüquq ['hüquq']
normalarda ['norma', 'larda']
hər ['hər']
üç ['üç']
element ['element']
olmur ['olmur']
"""

text = "həmişəbütünhüquqnormalardahərüçelementolmur"
splitted_text = Text(text)
splitted_text.language = "az"
print(splitted_text.morphemes)

"""
['həmişə', 'bütün', 'hüquq', 'norma', 'larda', 'hər', 'üç', 'element', 'olmur']
"""
def morf(w):
    return Word(w, language=LANGUAGE).morphemes
def preprocess_data(sentence_lists, pre_defined, meta, lang, dataset_type, missing_embed_to_zeros):
    # second pass:
    embeds = []
    max_len = meta['maxlen']
    count = 0
    too_long = 0
    errors = 0
    no_embedding = 0
    strange_id = 0
    for sentences in sentence_lists:
        for sentence in sentences:
            count += 1
            if len(sentence) > max_len - 1:  # first token will denote language
                # print(sentence)
                too_long += 1
                continue
            try:
                # first token is the root token for the dependency tree, also encodes the language
                this_sentence = {
                    'token': [pre_defined[lang]],
                    'head': [pre_defined[lang]],
                    'upos': [pre_defined[lang]],
                    'deprel': [pre_defined[lang]]
                }
                # go through the sentence, discarding all the tokens with composite ids
                tokens = []
                for tok in sentence:
                    if tok.head is None:
                        continue
                    try:
                        int(tok.id)  # check if that fails
                        tokens.append(tok)
                    except:
                        continue
                for t, token in enumerate(tokens):
                    try:
                        assert int(token.id) == t + 1, \
                            "token.id must equal t+1, instead got " + str(token.id) + ", t=" + str(t)
                        assert int(token.head) <= len(sentence)
                    except:
                        strange_id += 1
                        raise ValueError("strange id")
                    word = Word(token.form, language=lang)
                    try:
                        word_vector = word.vector
                    except:
                        no_embedding += 1
                        if missing_embed_to_zeros:
                            word_vector = np.zeros(256, dtype=np.float32)
                        else:
                            raise ValueError("no embedding")
                    if 'embed' not in this_sentence:
                        this_sentence['embed'] = [np.zeros_like(word_vector)]
                    this_sentence['embed'].append(word_vector)
                    this_sentence['token'].append(meta['emb_index'][lang][token.lemma])
                    this_sentence['head'].append(int(token.head))
                    this_sentence['upos'].append(meta['upos'][token.upos])
                    this_sentence['deprel'].append(meta['deprel'][token.deprel])
                this_sentence_nice = {
                    key: torch.tensor(pad(val, max_len))
                    for key, val in this_sentence.items() if key != 'embed'
                }
                pad_embed = pad(this_sentence['embed'], max_len, np.zeros_like(this_sentence['embed'][0]))
                pad_embed_nice = torch.from_numpy(np.array(pad_embed))
                this_sentence_nice['embed'] = pad_embed_nice
                embeds.append(this_sentence_nice)
            except ValueError as e:
                errors += 1
                continue
    if count > 0:
        print('kept ', len(embeds) / count, ' of all sentences')
    else:
        print("no valid sentences at all - what's going on here?")
    print('total', count, ', too long', too_long, ', no_embedding', no_embedding,
          ', strange ids', strange_id, ', total errors', errors)
    meta['stats'][lang][dataset_type] = OrderedDict()
    meta['stats'][lang][dataset_type]['orig_size'] = count
    meta['stats'][lang][dataset_type]['size'] = len(embeds)
    return embeds
def get_morphemes(word):
    """Return the morphemes of the given word as a space-joined string."""
    return ' '.join([str(m) for m in Word(word, language='en').morphemes])
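# Hypothetical usage sketch: assumes Word is imported from polyglot.text and the
# English morphology model is available.
from polyglot.text import Word

if __name__ == "__main__":
    print(get_morphemes("preprocessing"))      # morphemes joined by spaces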
def stem4(word):
    w = Word(word, language="tr")
    return w.morphemes[0]
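# Hypothetical usage sketch: stem4 keeps only the first morpheme polyglot
# returns for a Turkish word, a rough stand-in for its stem. Assumes Word is
# imported from polyglot.text and the Turkish morphology model is downloaded.
from polyglot.text import Word

if __name__ == "__main__":
    print(stem4("kitaplardan"))                # first morpheme used as a crude stem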