def delete_adj_adv(text):
    """Remove adjectives and adverbs from a Romanian text, keeping those preceded by an adposition or determiner."""
    tokenized = sent_tokenize(text)
    good_words = []
    old = len(text.split(' '))
    tagger = Tagger(language='ro')  # part-of-speech (POS) tagger for Romanian
    for sentence in tokenized:
        words_list = nltk.word_tokenize(sentence)
        for word in words_list:
            typed = tagger.tag(word)
            if typed[0][1] != 'ADJ' and typed[0][1] != 'ADV':
                good_words.append(typed[0][0])
            elif good_words:  # guard: the original indexed good_words[-1] even when the list was empty
                last_word = good_words[-1]
                if get_type(last_word)[0][1] in ("ADP", "DET"):
                    good_words.append(typed[0][0])
    new_text = ""
    for word in good_words:
        if word in ".,?!":
            new_text = new_text + word
        else:
            new_text = new_text + " " + word
    return new_text, old - len(new_text.split(' '))
def get_stem(tokens):
    """
    Gets the root of words by tagging them syntactically and then using the WordNet lemmatizer.

    @param tokens: list containing morphologically rich words
    @return: a list of words stripped to their root
    """
    tagger = Tagger(language="en")   # create a RippleTagger instance
    tagged = tagger.tag(tokens)      # list of (token, tag) pairs
    lemmat = WordNetLemmatizer()     # WordNet lemmatizer instance
    stemmed = []
    for token, tag in tagged:
        wordnettag = get_tag(tag)
        if wordnettag == 'OTHER':
            # words with unknown tags are added as they are
            stemmed.append([token, tag])
        else:
            stem = lemmat.lemmatize(token, wordnettag)
            stemmed.append([stem, wordnettag])
    return stemmed
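# --- illustrative helper (assumption) -----------------------------------------------
# get_stem() relies on a helper `get_tag` that is not shown here. A minimal sketch of
# what it is assumed to do: map universal POS tags to the WordNet POS constants that
# WordNetLemmatizer accepts, and return 'OTHER' for anything it cannot map. This is a
# hypothetical stand-in, not the original project's implementation.
from nltk.corpus import wordnet

def get_tag(universal_tag):
    # WordNet only distinguishes nouns, verbs, adjectives and adverbs
    mapping = {
        'NOUN': wordnet.NOUN,  # 'n'
        'VERB': wordnet.VERB,  # 'v'
        'ADJ': wordnet.ADJ,    # 'a'
        'ADV': wordnet.ADV,    # 'r'
    }
    return mapping.get(universal_tag, 'OTHER')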
def text_separator():
    text_dict = {"paragraph": [], "sentences": [], "analyses": []}
    tagger = Tagger(language="ro")
    text_support = "Karl Benz dar nici așa Cristi , Marea Neagra nu fu mulțumit Motorwagen. Se duse în curte și aduse un băț cu care o bătu zdravăn, iar biata " \
                   "fată plângea de se scutura cămașa pe ea. - Unchiule, unchiule, țipa ea, cu ce sunt eu de vină că " \
                   "lupul a dat iama în oi? Dar bărbatul cel crud nu cunoștea mila. - Să pleci din casa mea, strigă " \
                   "el în cele din urmă, ostenit de atâta bătaie, că nici nu putea să mai sufle. Să te duci de aici " \
                   "și să nu te mai întorci până ce nu-mi aduci oile înapoi!"
    # recursively apply every replacement in the dict d to the string s
    mrep = lambda s, d: s if not d else mrep(s.replace(*d.popitem()), d)
    text_support = re.sub(r'\s+', ' ', text_support)
    for sign in ['.', '!', '?', '...', '[...]', ':', "'", ',', '"']:
        text_support = str.replace(text_support, sign, " " + sign)
    # split the sentences by punctuation marks; pass a copy of dict_separator,
    # because mrep empties the dict it receives via popitem()
    text_support = mrep(text_support, dict(dict_separator)).split("###")[:-1]
    text_support = [
        sentence[1:] if sentence[0] == " " else sentence
        for sentence in text_support
    ]
    for sentence in text_support:
        text_dict["sentences"].append(sentence)
        text_dict["analyses"].append(tagger.tag(sentence))
    print(text_dict)
def test_swedish(self):
    tagger = Tagger(language="swedish")
    self.assertEqual(
        tagger.tag(u"Fördomen har alltid sin rot i vardagslivet"),
        [
            (u'Fördomen', 'NOUN'),
            (u'har', 'VERB'),
            (u'alltid', 'ADV'),
            (u'sin', 'DET'),
            (u'rot', 'NOUN'),
            (u'i', 'ADP'),
            (u'vardagslivet', 'NOUN'),
        ])
def test_swedish_alternative(self):
    tagger = Tagger(language="swedish-2")
    self.assertEqual(
        tagger.tag(u"Fördomen har alltid sin rot i vardagslivet"),
        [
            (u'Fördomen', 'NOUN'),
            (u'har', 'AUX'),   # Wrong, but predicted using swedish-2
            (u'alltid', 'ADV'),
            (u'sin', 'PRON'),  # Wrong, but predicted using swedish-2
            (u'rot', 'NOUN'),
            (u'i', 'ADP'),
            (u'vardagslivet', 'NOUN'),
        ])
def words_score(text):
    """Score every word by its part of speech, then score each sentence by the sum of its word scores."""
    separatori = [",", '.', '!', '?']
    tokenized = sent_tokenize(text)
    top_sentences = {}
    dictionary = {}
    tagger = Tagger(language='ro')
    # weight added to a word's score for each occurrence, by part of speech
    weights = {'PROPN': 2, 'VERB': 1, 'NOUN': 0.5}
    for sentence in tokenized:
        stripped = sentence
        for sign in separatori:
            stripped = stripped.replace(sign, '')
        for word in nltk.word_tokenize(stripped):
            token, tag = tagger.tag(word)[0]
            dictionary[token] = dictionary.get(token, 0) + weights.get(tag, 0.1)
    for sentence in tokenized:
        stripped = sentence
        for sign in separatori:
            stripped = stripped.replace(sign, '')
        suma = 0
        for word in nltk.word_tokenize(stripped):
            if word in dictionary:
                suma = suma + dictionary[word]
        top_sentences[sentence] = suma
    return top_sentences
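# A short usage sketch (assumptions: `text_ro` is a placeholder Romanian input string
# and the NLTK punkt data is available): rank the sentences by their accumulated word
# scores and keep the three best ones as a crude extractive summary.
text_ro = "Ion citește o carte interesantă. Maria scrie o scrisoare. Afară plouă."
scores = words_score(text_ro)
top3 = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:3]
print(" ".join(sentence for sentence, _ in top3))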
def test_english(self):
    tagger = Tagger(language="en")
    self.assertEqual(
        tagger.tag(u"The quick brown fox jumps over the lazy dog ."),
        [
            (u'The', u'DET'),
            (u'quick', u'ADJ'),
            (u'brown', u'ADJ'),
            (u'fox', u'NOUN'),
            (u'jumps', u'VERB'),
            (u'over', u'ADP'),
            (u'the', u'DET'),
            (u'lazy', u'ADJ'),
            (u'dog', u'NOUN'),
            (u'.', u'PUNCT'),
        ])
def test_french(self):
    tagger = Tagger(language="fra-1")
    self.assertEqual(
        tagger.tag(
            u"Cette annonce a fait l' effet d' une véritable bombe ."),
        [
            (u'Cette', 'DET'),
            (u'annonce', 'NOUN'),
            (u'a', 'AUX'),
            (u'fait', 'VERB'),
            (u"l'", 'DET'),
            (u'effet', 'NOUN'),
            (u"d'", 'ADP'),
            (u'une', 'DET'),
            (u'véritable', 'ADJ'),
            (u'bombe', 'NOUN'),
            (u'.', 'PUNCT'),
        ])
def posTagging():
    extractAbout()
    client = MongoClient()
    db = client.usersbot.testRecommendation
    tagger = Tagger(language="it")
    count = 0
    for r in db.find():
        count = count + 1
        print("ristorante " + str(count))
        id = r['_id']
        tagADJ = []
        tagNOUN = []
        for el in tagger.tag(r['about']):
            if el[1] == 'ADJ':
                tagADJ.append(el[0])
            if el[1] == 'NOUN':
                tagNOUN.append(el[0])
        if len(tagADJ) > 0 or len(tagNOUN) > 0:
            db.update_one({"_id": id}, {"$set": {"tagADJ": tagADJ, "tagNOUN": tagNOUN}})
        else:
            # drop entries with no adjective/noun tags; remove() is deprecated in pymongo
            db.delete_one({'_id': id})
    client.close()
def __init__(self, doc_id, language, paragraph_list, doc_text):
    self.doc_id = doc_id
    self.language = language
    self.para_ids = paragraph_list
    self.doc_text = unicode(doc_text)
    self.word_tokens = re.findall(r'[\w\']+|[.,!?;]', self.doc_text)
    print 'Initialize class and tagging the content, language: {}'.format(
        self.language)
    # split the token stream into pseudo-paragraphs of 500 tokens each
    self.paragraphes = [
        self.word_tokens[x:x + 500]
        for x in xrange(0, len(self.word_tokens), 500)
    ]
    self.tagged_content = []
    # tag each paragraph and add it to the tagged content
    for paragraph in self.paragraphes:
        self.tagged_content.append(
            Tagger(language=language).tag(' '.join(paragraph)))
def getTagWords():
    client = MongoClient()
    db1 = client.usersbot.testRecommendation
    db2 = client.usersbot.tagWords
    db2.drop()
    db2.create_index([('word', TEXT)], unique=True)
    tagger = Tagger(language="it")
    count = 1
    for r in db1.find():
        for adj in r['tagADJ']:
            try:
                db2.insert({'_id': count, 'word': adj, 'type': 'ADJ', 'count': 1})
                count = count + 1
            except errors.DuplicateKeyError as e:
                # recover the duplicated word from the error message, then bump its counter
                word = str(e).split("{ : ")[1].split(",")[0].replace("\"", "")
                try:
                    id = db2.find({'$text': {'$search': word, '$diacriticSensitive': False}})[0]['_id']
                except IndexError:
                    id = db2.find({'$text': {'$search': adj, '$diacriticSensitive': False}})[0]['_id']
                db2.update_one({"_id": id}, {"$inc": {"count": +1}})
        for noun in r['tagNOUN']:
            try:
                db2.insert({'_id': count, 'word': noun, 'type': 'NOUN', 'count': 1})
                count = count + 1
            except errors.DuplicateKeyError as e:
                word = str(e).split("{ : ")[1].split(",")[0].replace("\"", "")
                try:
                    id = db2.find({'$text': {'$search': word, '$diacriticSensitive': False}})[0]['_id']
                except IndexError:
                    id = db2.find({'$text': {'$search': noun, '$diacriticSensitive': False}})[0]['_id']
                db2.update_one({"_id": id}, {"$inc": {"count": +1}})
    client.close()
def eliminate_enumerations(sentences):
    """
    Eliminates enumerations from sentences.

    :param sentences: the output from @process_text
    :return: the text with each detected enumeration replaced by its highest-scoring word;
             sentences in which no enumeration was found are kept unchanged
    """
    enum_regexp = re.compile(
        r'((\w+\-?\w+\s*\,\s*){2,100}\w+\-?\w+)|((\w+\-?\w+\s*\,\s*){1,100}\s*\w+\s+(si)\s+\w+)'
    )
    enum_regexp_special_case = re.compile(r'((\w+\-?\w+\s*\,\s*){2,100})')
    tagger = Tagger(language="ro")
    tagged_sentences = tagger.tag(sentences)
    sentences = nltk.sent_tokenize(sentences)

    # finding the enumerations
    enumerations = list()
    for sentence in sentences:
        sent_enums = [
            enum_regexp.findall(sentence),
            enum_regexp_special_case.findall(sentence)
        ]
        enumerations.append(sent_enums)

    # process the findall output and take only the full-match enumeration
    for i in range(0, len(enumerations)):
        if enumerations[i][0]:
            max_len = max([len(j) for j in enumerations[i][0][0]])
            max_len_index = [
                j for j in range(0, len(enumerations[i][0][0]))
                if len(enumerations[i][0][0][j]) == max_len
            ][0]
            enumerations[i][0] = enumerations[i][0][0][max_len_index]
        if enumerations[i][1]:
            max_len = max([len(j) for j in enumerations[i][1][0]])
            max_len_index = [
                j for j in range(0, len(enumerations[i][1][0]))
                if len(enumerations[i][1][0][j]) == max_len
            ][0]
            enumerations[i][1] = enumerations[i][1][0][max_len_index]

    # split the enumerations into tokens of words in tokenized_enums
    tokenized_enums = list()
    token_regex = re.compile(r"\w+-?\w*")
    for it in enumerations:
        if it != [[], []]:
            tokenized_enum = [
                token_regex.findall(str(it[0])),
                token_regex.findall(str(it[1]))
            ]
            tokenized_enums.append(tokenized_enum)
        else:
            tokenized_enums.append([[], []])

    # the output text
    new_text = ''
    # for each enumeration
    for enumeration in range(0, len(enumerations)):
        # if it is not empty
        if enumerations[enumeration] != [[], []]:
            # call the function that outputs the part of speech
            p_o_speech = get_part_of_speech_enum(
                tagged_sentences, tokenized_enums[enumeration][0])
            # check whether the words of the enumeration are NOUN, ADJ or ADV
            count = 0
            for enum_word in p_o_speech:
                if (enum_word[1] == 'NOUN' or enum_word[1] == 'ADJ'
                        or enum_word[1] == 'ADV'
                        or enum_word[0].lower() == 'și'
                        or enum_word[0].lower() == 'si'):
                    count += 1
            # if they are, eliminate the enumeration from the sentence,
            # keeping only its highest-scoring word
            if count > 0 and count == len(p_o_speech):
                best_score = max([globals.SCORES[i[0]] for i in p_o_speech])
                best_word = [
                    i[0] for i in p_o_speech
                    if globals.SCORES[i[0]] == best_score
                ][0]
                new_text += sentences[enumeration].replace(
                    enumerations[enumeration][0], " " + best_word + " ") + " "
                globals.ENUMERATIONS_REMOVED.append(
                    enumerations[enumeration][0])
            # do the same thing again for the special case if the regular case didn't match
            else:
                if tokenized_enums[enumeration][1]:
                    p_o_speech_special_case = get_part_of_speech_enum(
                        tagged_sentences, tokenized_enums[enumeration][1])
                    count = 0
                    for enum_word in p_o_speech_special_case:
                        if (enum_word[1] == 'NOUN' or enum_word[1] == 'ADJ'
                                or enum_word[1] == 'ADV'
                                or enum_word[0].lower() == 'și'
                                or enum_word[0].lower() == 'si'):
                            count += 1
                    # if it is an enumeration that must be removed
                    if count == len(p_o_speech_special_case):
                        best_score = max([
                            globals.SCORES[i[0]]
                            for i in p_o_speech_special_case
                        ])
                        best_word = [
                            i[0] for i in p_o_speech_special_case
                            if globals.SCORES[i[0]] == best_score
                        ][0]
                        new_text += sentences[enumeration].replace(
                            enumerations[enumeration][1],
                            " " + best_word + " ") + " "
                        globals.ENUMERATIONS_REMOVED.append(
                            enumerations[enumeration][1])
        # if no enumeration was found, keep the sentence unchanged
        else:
            new_text += sentences[enumeration] + " "
    return new_text
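# --- illustrative helper (assumption) -----------------------------------------------
# eliminate_enumerations() calls `get_part_of_speech_enum`, which is defined elsewhere
# in the project. A hypothetical sketch of the behaviour the code above expects: look
# each enumeration token up in the already-tagged text and return (token, tag) pairs,
# defaulting to 'X' for tokens that were not found.
def get_part_of_speech_enum(tagged_sentences, enum_tokens):
    tag_by_token = {token: tag for token, tag in tagged_sentences}
    return [(token, tag_by_token.get(token, 'X')) for token in enum_tokens]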
def getFeatures(gelen):
    yazi = list(filter(('').__ne__, gelen))  # drop empty sentences

    # ----- FEATURE 1: SENTENCE LENGTH ----- #
    f1 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        cumleuzunluk = len(yazi[i].split())
        f1[i] = cumleuzunluk
    f1 = f1 / max(f1)

    # ----- FEATURE 2: SENTENCE POSITION ----- #
    f2 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        f2[i] = (len(yazi) - yazi.index(yazi[i])) / len(yazi)

    # ----- FEATURE 3: TERM WEIGHT (TF/ISF) ----- #
    f3 = np.zeros(len(yazi))
    tfidf = TfidfVectorizer().fit_transform(yazi)
    for i in range(len(yazi)):
        f3[i] = tfidf[i].sum()
    f3 = f3 / max(f3)

    # ----- FEATURE 4: PROPER NOUNS ----- #
    f5 = np.zeros(len(yazi))
    tagger = Tagger(language="english")
    for i in range(len(yazi)):
        sayi = len([item for item in tagger.tag(yazi[i]) if item[1] == 'NOUN'])
        sayi = sayi / len(yazi[i].split())
        f5[i] = sayi

    # ----- FEATURE 5: THEMATIC WORDS ----- #
    sw = getsw()
    c = Counter([i for i in ' '.join(yazi).lower().split() if i not in sw]).most_common(5)
    tematikler = [item[0] for item in c]
    f6 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        f6[i] = len(set(yazi[i].lower().split()) & set(tematikler)) / len(yazi[i].split())

    # ----- FEATURE 6: NUMERIC DATA ----- #
    f7 = np.zeros(len(yazi))
    for i in range(len(yazi)):
        f7[i] = len([int(s) for s in yazi[i].split() if s.isdigit()]) / len(yazi[i].split())

    # ----- FEATURE 7: SENTENCE SIMILARITY SCORE ----- #
    f8 = np.zeros(len(yazi))
    tfidf = TfidfVectorizer().fit_transform(yazi)
    for i in range(len(yazi)):
        f8[i] = cosine_similarity(tfidf[i], tfidf).sum()
    f8 = f8 / max(f8)

    sutunlar = ['f1_uzunluk', 'f2_konum', 'f3_tfisf', 'f4_özelisim', 'f5_tematik', 'f6_numerik', 'f7_benzerlik']
    ind = [str(i) for i in range(len(yazi))]
    data = np.array([f1, f2, f3, f5, f6, f7, f8])
    Dframe = pd.DataFrame(data=data, index=sutunlar, columns=ind)
    dizi = Dframe.sum(axis=0).to_numpy()  # .as_matrix() was removed from pandas; .to_numpy() replaces it
    geridondur = []
    for t in range(len(dizi)):
        geridondur.append((dizi[t], t))
    # return (total feature score, sentence index) pairs for the document
    return geridondur
def get_type(word):
    # note: this builds a new Romanian tagger on every call
    return Tagger(language='ro').tag(word)
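# A possible refinement (an assumption, not part of the original code): constructing a
# Tagger loads its model, so caching one Romanian tagger at module level avoids paying
# that cost on every call. `_RO_TAGGER` and `get_type_cached` are hypothetical names
# introduced only for this sketch.
_RO_TAGGER = None

def get_type_cached(word):
    global _RO_TAGGER
    if _RO_TAGGER is None:
        _RO_TAGGER = Tagger(language='ro')
    return _RO_TAGGER.tag(word)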
def remove_dialog(text, alpha):
    text = re.sub(r'[ \t]*-', '-', text)
    final_text = ""
    i = 0
    # drop dialogue lines (lines starting with "-") from the output text
    while i < len(text):
        if i == 0 and text[i] == "-":
            while i < len(text) and text[i] != '\n':
                i += 1
        elif text[i] == "-" and text[i - 1] == "\n":
            while i < len(text) and text[i] != '\n':
                i += 1
        else:
            final_text += text[i]
            i += 1
    temp_text = clean_pre_text(text)
    tagger = Tagger(language="ro")
    original_len = len(temp_text)
    paragraphs = str.splitlines(temp_text)
    word_multiple_tags = dict()
    for paragraph in paragraphs:
        if paragraph == " " or paragraph == "":
            continue
        first_non_whitespace_position = 0
        while (first_non_whitespace_position < len(paragraph)
               and paragraph[first_non_whitespace_position] in [" ", "\n", "\t"]):
            first_non_whitespace_position += 1
        paragraph = paragraph[first_non_whitespace_position:]
        if paragraph[0] != "-":
            continue
        paragraph = paragraph[1:]
        paragraph = re.sub(r'[-]', ' ', paragraph)
        temp_tags = tagger.tag(paragraph)
        add_words_using_class(paragraph)  # TODO: DECOMMENT THIS AND TEST IT BEFORE PROD RELEASE
        right_tags = []
        for it in temp_tags:
            if it[1] not in ["PUNCT", ""]:
                right_tags.append(it)
        # count how many times each word received each tag
        for word, tag in right_tags:
            if word not in word_multiple_tags.keys():
                word_multiple_tags[word] = dict()
            if tag not in word_multiple_tags[word].keys():
                word_multiple_tags[word][tag] = 1
            else:
                word_multiple_tags[word][tag] += 1
    # keep, for every word, its most frequent tag and its total count
    word_tag = dict()
    for word, tags_and_nr in word_multiple_tags.items():
        nr_max = 0
        nr_total = 0
        real_tag = "CONJ"
        for tag, nr in tags_and_nr.items():
            nr_total += nr
            if nr > nr_max:
                nr_max = nr
                real_tag = tag
        word_tag[word] = (real_tag, nr_total)
    for word, tag_nr in word_tag.items():
        tag = tag_nr[0]
        nr = tag_nr[1]
        update_dict(word, tag, nr)
    # rescale alpha to account for the share of text removed as dialogue
    new_len = len(final_text)
    dialog_len = original_len - new_len
    alpha_dialog_cut = dialog_len * 1.0 / original_len * 100
    if int(alpha_dialog_cut) >= 99 - alpha or alpha_dialog_cut >= 100:
        new_alpha = 101
    else:
        new_alpha = int(100 * alpha / (100 - alpha_dialog_cut))
    return final_text, new_alpha
def __getattr__(name):
    """
    Module-level attribute hook: acts as a property, used as "globals.SCORES[word_romanian]".

    Assigns a specific score to words based on their part of speech:
        proper noun = +4, noun = +2, verb = +2, other = +1
    (multiplied by the word count returned by 'find_singularity').

    :return: dictionary where each pair (key, value) is (word, score_of_word)
    """
    if name == "SCORES":
        if _SCORES == {}:
            stop_words = nltk.corpus.stopwords.words('romanian')
            word_count, _, _ = find_singularity(ORIGINAL_TEXT)
            tagger = Tagger(language='ro')
            words_part_of_sent = dict()
            for sentence in nltk.sent_tokenize(ORIGINAL_TEXT):
                sentence = re.sub("[.,!?%^~$„”\"\']", "", sentence)
                sentence = re.sub(":", " ", sentence)
                sentence = tagger.tag(sentence)
                for word in sentence:
                    word_in_ro = word[0]
                    sentence_part = word[1]
                    if word_in_ro not in words_part_of_sent.keys():
                        words_part_of_sent[word_in_ro] = defaultdict(lambda: 0)
                    words_part_of_sent[word_in_ro][sentence_part] += 1
            for word in words_part_of_sent.keys():
                max_word_part_count = 0
                word_part = ""
                if word.lower() in stop_words:
                    _SCORES[word] = 1
                    continue
                if word in _SCORES.keys():
                    continue
                # keep the part of speech this word was tagged with most often
                for part_of_sent in words_part_of_sent[word].keys():
                    count = words_part_of_sent[word][part_of_sent]
                    if count > max_word_part_count:
                        max_word_part_count = count
                        word_part = part_of_sent
                if word_part in scores_points.keys():
                    _SCORES[word] = word_count[word] * scores_points[word_part]
                else:
                    _SCORES[word] = word_count[word] * scores_points["OTHER"]
        return _SCORES
    raise AttributeError(name)  # any other missing attribute should fail as usual
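# --- illustrative module-level data (assumption) --------------------------------------
# The hook above reads two module-level names that are defined elsewhere in the project.
# A hypothetical shape for them, with weights taken from the docstring (proper noun = 4,
# noun = 2, verb = 2, everything else = 1):
_SCORES = {}
scores_points = {"PROPN": 4, "NOUN": 2, "VERB": 2, "OTHER": 1}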
def tag(tokens):
    tagger = Tagger(language=config.LANG_CODE)
    return tagger.tag(' '.join(tokens))
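# Minimal usage sketch of the tagger API the snippets above rely on. The import path
# assumes the RippleTagger package layout; `config.LANG_CODE` is whatever language code
# the surrounding project configures (e.g. "en").
from rippletagger.tagger import Tagger

tagger = Tagger(language="en")
print(tagger.tag("The quick brown fox jumps over the lazy dog ."))
# -> [('The', 'DET'), ('quick', 'ADJ'), ('brown', 'ADJ'), ('fox', 'NOUN'), ...]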