def lemmatize_corpus(liste, threshold, exclusion=True):
    """
    :param liste: corpus_words, i.e. the list of tokenized lyrics (one list of tokens per text)
    :param threshold: minimum frequency a token must exceed to be kept when exclusion is on
    :param exclusion: if True, only tokens more frequent than the threshold are lemmatized
    :return: list of lemmatized words from the lyrics
    """
    words = []
    corpus_words = liste
    word_frequency = {}
    lemmas_words = []
    # Count the frequency of every token in the corpus.
    for text in corpus_words:
        for token in text:
            if token in word_frequency:
                word_frequency[token] += 1
            else:
                word_frequency[token] = 1
    # print("word frequency : ", word_frequency)
    # Build the lemmatizer once and reuse it for every token.
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    for text in corpus_words:
        for token in text:
            if exclusion:
                if word_frequency[token] > threshold:
                    words.append(token)
                    lemmas = lemmatizer(token, u'NOUN')
                    lemmas_words.append(" ".join(lemmas))
            else:
                lemmas = lemmatizer(token, u'NOUN')
                # print("token : ", token, " Lemmatization : ", lemmas)
                lemmas_words.append(" ".join(lemmas))
    # print("lemmas words : \n", lemmas_words)
    return lemmas_words
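# A minimal driver for lemmatize_corpus(), written as a sketch. It assumes spaCy 2.0/2.1,
# where the English lemma tables are importable as below (the same names the function uses),
# and that `liste` is a list of token lists, one per song. The sample data is hypothetical.
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

lyrics_tokens = [["dogs", "nights", "dogs"], ["nights", "cars"]]
# With threshold=1, only tokens seen more than once ("dogs", "nights") are lemmatized.
print(lemmatize_corpus(lyrics_tokens, threshold=1))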
def __init__(self, disable: list = None, stopwords: list = None,
             batch_size: int = None, ngram_range: Tuple[int, int] = None,
             lemmas=False, lowercase: bool = None, alphas_only: bool = None):
    """
    :param disable: pipeline processors to omit; if nothing should be disabled,
        pass an empty list
    :param stopwords: a set of words to skip
    :param batch_size: a batch size for internal spaCy multi-threading
    :param ngram_range: range for producing ngrams, e.g. (1, 2) for unigrams + bigrams,
        (2, 2) for bigrams only
    :param lemmas: whether to perform lemmatization while tokenizing;
        currently works only for English
    :param lowercase: whether to lowercase tokens
    :param alphas_only: whether to keep only alphabetic tokens
    """
    if disable is None:
        disable = ['parser', 'ner']
    self._stopwords = stopwords or []
    self.model = spacy.load('en', disable=disable)
    self.tokenizer = Tokenizer(self.model.vocab)
    self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES, LOOKUP)
    self.batch_size = batch_size
    self.ngram_range = ngram_range
    self.lemmas = lemmas
    self.lowercase = lowercase
    self.alphas_only = alphas_only
def solve(word_list):
    wnl = stem.WordNetLemmatizer()
    porter = stem.porter.PorterStemmer()
    a = [porter.stem(word) for word in word_list]
    b = [wnl.lemmatize(word) for word in word_list]
    lemmatizer = Lemmatizer()
    c = [lemmatizer.lookup(word) for word in word_list]
    res = {'a': a, 'b': b, 'c': c}
    return res
def __init__(self, filter_score=DEFAULT_FILTER_CONTEXT_SCORE,
             proxy_server=DEFAULT_PROXY_SERVER):
    self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    self.wikipedia_cache = {}
    self.fetcher = AsyncWikiSearcher(proxy_server)
    self.graph_data = GraphData()
    self.wikidata_property_table = WikiDataPropertyTable.get_instance()
    self.embedding = {}
    self.filter_score = filter_score
    self.NLP = SpacyNLPFactory.create_simple_nlp_pipeline()
    self.all_domain_vector = {}
def lemmatize(self, *args, **kwargs):
    if not self._lemmatizer:
        from spacy.lemmatizer import Lemmatizer
        from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
        self._lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    return self._lemmatizer(*args, **kwargs)[0]
def task_three(doc1):
    tokens = []
    grammar_tags = dict()
    part_of_speech_tags = []
    depend_tags = dict()
    lemmas = dict()
    for tokk in doc1:
        tokens.append(tokk.text)
        grammar_tags[tokk] = tokk.tag_
        part_of_speech_tags.append(tokk.pos_)
        if tokk.dep_ in depend_tags.keys():
            value_set = depend_tags[tokk.dep_]
            value_set.add(tokk)
        else:
            value_set = set()
            value_set.add(tokk)
            depend_tags[tokk.dep_] = value_set
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    for i in range(0, len(tokens)):
        lemmas[tokens[i]] = lemmatizer(tokens[i], part_of_speech_tags[i])
    print("Tokenize")
    print(tokens)
    print()
    print("Lemmatize")
    print(lemmas)
    print()
    print("POS Tags")
    print(grammar_tags)
    print()
    print("Dependency Parse Tree")
    print(depend_tags)
    print()
    for tokk in tokens:
        syn = wn.synsets(tokk)
        hypernym = []
        hyponym = []
        holonym = []
        meronym = []
        for synset in syn:
            hypernym.append(synset.hypernyms())
            hyponym.append(synset.hyponyms())
            holonym.append(synset.part_holonyms())
            meronym.append(synset.part_meronyms())
        print(tokk)
        print()
        print("Hypernyms")
        print(hypernym)
        print()
        print("Hyponyms")
        print(hyponym)
        print()
        print("Holonyms")
        print(holonym)
        print()
        print("Meronyms")
        print(meronym)
        print()
def __init__(self, inferenceEngine, colorFile="corpora/colors.csv",
             sizeFile="corpora/sizes.txt", shapeFile="corpora/shapes.txt",
             nerModel="models/nerModel"):
    self.query = ""
    self.nlp = spacy.load('en')
    ner = spacy.load(nerModel).pipeline[0][1]
    self.nlp.replace_pipe("ner", ner)
    self.inferenceEngine = inferenceEngine
    self.matcher = Matcher(self.nlp.vocab)
    self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    self.scene = {
        "objects": [],
        "backgrounds": []
    }
    self.subjects = {}
    self.referenceWords = ["the", "it", "that", "his", "hers", "theirs"]
    self.colors = {}
    with open(colorFile, "r") as colorReader:
        for line in colorReader:
            colorValue = line.split(",")
            self.colors[colorValue[0].lower()] = colorValue[1].strip("\n")
    self.sizes = {}
    with open(sizeFile, "r") as sizeReader:
        for line in sizeReader:
            line = line.strip().lower()
            sizeValue = line.split(",")
            self.sizes[sizeValue[0]] = sizeValue[1].strip("\n")
    self.shapes = []
    with open(shapeFile, "r") as shapeReader:
        self.shapes = [shape.strip().lower() for shape in shapeReader]
def wsd_level(sentence, word):
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    bert = (model, tokenizer)
    doc = nlp(sentence.lower())
    words = []
    pos_tags = []
    for token in doc:
        words.append(token.text)
        pos_tags.append(token.pos_)
    idx = [i for i, w in enumerate(words) if w == word]
    wsd_results = wsd_bert(words, idx, pos_tags, lemmatizer=lemmatizer, bert=bert)
    # print(wsd_results)
    wsd_scores = [
        score for example_info in wsd_results
        for example, level, score in example_info
    ]
    wsd_levels = [
        level for example_info in wsd_results
        for example, level, score in example_info
    ]
    # print(wsd_scores.index(max(wsd_scores)))
    return wsd_levels[wsd_scores.index(max(wsd_scores))]
def __init__(self, process, print_mode="single"):
    self.process = process
    self.print_mode = print_mode
    logger = logging.getLogger("program")
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))
    self.logger = logger
    self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    self.user = getpass.getuser()
    if os.path.isdir("/media/" + self.user + "/Data4/ROSA/db/"):
        self.data_dir = "/media/" + self.user + "/Data4/ROSA/db/"
        self.save_dir = "/media/" + self.user + "/Data4/ROSA/graph/"
    else:
        self.data_dir = "/home/" + self.user + "/Data4/ROSA/db/"
        self.save_dir = "/home/" + self.user + "/Data4/ROSA/graph/"
    if self.process == "update":
        self._convert_raw_data_to_graph()
    # elif self.process == "run":
    self._load_graph()
    self.logger.info("ROSA is now live..")
    while True:
        text = raw_input("ROSA says: What do you want to know about? ")  # Python 2
        self._query(text)
def __init__(self, train_corpus: TextIO):
    """Initialize and train the model.

    Args:
        train_corpus: An open file or text stream with annotated text for
            training the model.
    """
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    self.coocurrenz = defaultdict(lambda: defaultdict(int))
    self.all_verbs = set()
    self.all_nouns = set()
    for line in train_corpus:
        verbs = set()
        nouns = set()
        for word_pos in line.split():
            (word, pos) = str(word_pos).split('_')
            pos_type = pos[0]
            if (pos_type != 'V') and (pos_type != 'N'):
                continue  # nothing to do with this word
            word = word[2:]
            if pos_type == 'V':
                verb = lemmatizer(word, u'VERB')[0]
                verbs.add(verb)
            elif pos_type == 'N':
                noun = lemmatizer(word, u'NOUN')[0]
                nouns.add(noun)
        self.all_nouns.update(nouns)
        self.all_verbs.update(verbs)
        for noun in nouns:
            for verb in verbs:
                self.coocurrenz[noun][verb] += 1
    self.all_nouns = sorted(self.all_nouns)
    self.all_verbs = sorted(self.all_verbs)
def concept_sets(self, value):
    """
    Sets concept_sets and the attributes derived from it.

    Args:
        value (list of list of str): A list of lists of strings; each string
            being a concept, each set in the larger list corresponding to a
            document which has the tags seen in the set.
    """
    self._concept_sets = value
    LOG.debug("Extracting raw keywords as concepts.")
    all_concepts = [
        concept for concept_set in tqdm(self._concept_sets)
        for concept in concept_set if concept.strip() != ""
    ]
    raw_concepts = set(all_concepts)
    LOG.debug("Lemmatizing {} raw concepts.".format(len(raw_concepts)))
    concepts = [c.lower() for c in raw_concepts]
    self.raw2lemma = {rc: c for rc, c in zip(raw_concepts, concepts)}
    lookups = Lookups()
    lookups.add_table("lemma_lookup", self.raw2lemma)
    self.lemmatizer = Lemmatizer(lookups)
    self.lemma2raw = {v: k for k, v in self.raw2lemma.items()}
    lemma_concepts = [
        self.lemmatizer(concept, "NOUN")[0] for concept in all_concepts
    ]
    self.concepts_frequencies = Counter(lemma_concepts)
    self.concepts = set(lemma_concepts)
    self._fit_concept_indices()
def lemmatize(descriptions):
    tqdm_notebook().pandas()
    nlp = spacy.load('en')
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    return descriptions.progress_apply(
        lambda desc: ' '.join([lemmatizer(token.text, token.pos_)[0]
                               for token in nlp(desc)]))
def lemmatizer():
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {
        "dogs": "dog",
        "boxen": "box",
        "mice": "mouse"
    })
    return Lemmatizer(lookups)
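# Usage sketch for the lookup-based lemmatizer above, treating lemmatizer() as a plain
# factory function. It assumes spaCy 2.2.x, where Lemmatizer takes a Lookups object and,
# with only a "lemma_lookup" table present, both lookup() and __call__ fall back to that table.
lem = lemmatizer()
print(lem.lookup("dogs"))   # expected: 'dog'     (direct table lookup)
print(lem("mice", "NOUN"))  # expected: ['mouse'] (__call__ returns a list of candidates)
print(lem.lookup("cats"))   # expected: 'cats'    (unknown forms are returned unchanged)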
def __init__(self, model="en_core_web_lg", testing=False):
    print("Loading {}...".format(model))
    self.nlp = spacy.load(model)
    self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    self.title_similarity = 0.90
    print("{} loaded.".format(model))
    if not testing:
        super().__init__()
def __init__(self):
    self.nlp = StanfordCoreNLP('http://localhost', port=9000)
    self.spacy = spacy.load('en_core_web_sm')
    self.rewritten = None
    self.sentences = None
    self.text2ent = defaultdict(None)
    self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    self.adjResolver = AdjectiveResolver()
    self.verbResolver = VerbResolver()
def lemmatization(tweet, nlp):
    """Return the tweet with every token replaced by its lemma."""
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    tweet = nlp(tweet)
    lemmatized = [
        lemmatizer(word.text.lower(), word.pos_)[0] for word in tweet
    ]
    return " ".join(lemmatized)
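# A minimal driver for lemmatization(), written as a sketch. It assumes spaCy 2.0/2.1
# (so that LEMMA_INDEX, LEMMA_EXC and LEMMA_RULES are importable in the enclosing module)
# and that the small English model is installed via `python -m spacy download en_core_web_sm`.
# The sample tweet is hypothetical.
import spacy

nlp = spacy.load("en_core_web_sm")
print(lemmatization("the dogs were barking all night", nlp))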
def lemma_wordlist(word_list):
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    lemma_list = []
    for word in word_list:
        lemma = lemmatizer(word, u"NOUN")
        lemma_list.append(lemma[0])
    return lemma_list
def get_lemmatizer():
    if not _spacy['lemmatizer']:
        from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
        from spacy.lemmatizer import Lemmatizer
        lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        _spacy['lemmatizer'] = lemmatizer
    else:
        lemmatizer = _spacy['lemmatizer']
    return lemmatizer
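# Usage sketch for the cached rule-based lemmatizer above, assuming spaCy 2.0/2.1 and a
# module-level cache such as `_spacy = {'lemmatizer': None}` (the cache name is taken from
# the function; its initialization here is an assumption). The rule-based lemmatizer
# returns a list of candidate lemmas for a (word, coarse POS) pair.
_spacy = {'lemmatizer': None}

print(get_lemmatizer()("dogs", "NOUN"))  # expected: ['dog']  (suffix rule "s" -> "")
print(get_lemmatizer()("feet", "NOUN"))  # expected: ['foot'] (irregular-noun exception)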
def __init__(self, recognizer, source):
    self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    self.engine = pyttsx3.init()
    self.engine.setProperty(
        'voice', 'com.apple.speech.synthesis.voice.ava.premium')
    self.engine.setProperty('rate', self.engine.getProperty('rate') - 31.5)
    self.nlp = spacy.load("en_core_web_sm")
    self.recognizer = recognizer
    self.source = source
def lemmatize(sent, nlp):
    lemmatized = []
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    doc = nlp(str(sent))
    for token in doc:
        i = lemmatizer(str(token), token.pos_)
        # print(i)
        lemmatized.append(i[0])
    return ' '.join(lemmatized)
def test_issue595():
    """Test lemmatization of base forms"""
    words = ["Do", "n't", "feed", "the", "dog"]
    tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
    rules = {"verb": [["ed", "e"]]}
    lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=words)
    doc[2].tag_ = "VB"
    assert doc[2].text == "feed"
    assert doc[2].lemma_ == "feed"
def test_issue1387():
    tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
    index = {"verb": ("cope", "cop")}
    exc = {"verb": {"coping": ("cope",)}}
    rules = {"verb": [["ing", ""]]}
    lemmatizer = Lemmatizer(index, exc, rules)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=["coping"])
    doc[0].tag_ = "VBG"
    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"
def test_issue1387():
    tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
    lookups = Lookups()
    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
    lemmatizer = Lemmatizer(lookups)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=["coping"])
    doc[0].tag_ = "VBG"
    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"
def lemmatize(data):
    output = []
    lemmatizerEn = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    lemmatizerEs = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES,
                              lookup=spacy.lang.es.LOOKUP)
    bar = ChargingBar('Lemmatizing\t\t\t\t', max=len(data))
    for instance in data:
        new_tweet = {}
        new_tweet['tweetid'] = instance['tweetid']
        new_tweet['tweet'] = instance['tweet']
        new_tweet['tokens'] = []
        new_tweet['langid'] = instance['langid']
        new_tweet['sentiment'] = instance['sentiment']
        for i, word in enumerate(instance['tokens']):
            if instance['langid'][i] == 'lang1':
                new_tweet['tokens'].append(lemmatizerEn.lookup(word))
            elif instance['langid'][i] == 'lang2':
                new_tweet['tokens'].append(lemmatizerEs.lookup(word))
            else:
                new_tweet['tokens'].append(word)
                # new_tweet['tokens'].append(lemmatizerEn.lookup(word))
        output.append(new_tweet)
        new_tweet = {}
        new_tweet['tweetid'] = instance['tweetid']
        new_tweet['tweet'] = instance['tweet']
        new_tweet['tokens'] = []
        new_tweet['langid'] = []
        new_tweet['sentiment'] = instance['sentiment']
        bar.next()
    bar.finish()
    return output
def __init__(self):
    self.entities = []
    self.columns = []
    self.relationships = []
    self.synonyms_col = []
    self.synonyms_tab = []
    self.entity_graph = []
    self.loaded_entities = []
    self.config = Configuration()
    self.conn = pyodbc.connect(self.config.get_sql_connection_string())
    lookups = Lookups()
    self.lemmatizer = Lemmatizer(lookups)
    self.load_db_model()
def get_cleaned_text(self):
    '''
    This function cleans the text.
    '''
    with gzip.open(self.path, 'rb') as f:
        self.text = f.read().decode('utf-8')
    # removing stop words and words with only one character
    nlp = spacy.load("en_core_web_sm")
    stop_words = set(nlp.Defaults.stop_words)
    self.text = self.text.lower().split(' ')
    self.text = [
        word for word in self.text
        if word not in stop_words and len(word) > 1
    ]
    # lemmatizing the words
    lemmatizer = Lemmatizer()
    self.text = [lemmatizer.lookup(word) for word in self.text]
    return self.text
def word_lemmatizer(data):
    lemmatizer = Lemmatizer(lookup=LOOKUP)
    for doc in data:
        doc["lemma_paragraphs"] = []
        for i, paragraph in enumerate(doc["stopped_paragraphs"]):
            doc["lemma_paragraphs"].append([])
            doc["lemma_paragraphs"][i] = []
            for k, sentence in enumerate(paragraph):
                doc["lemma_paragraphs"][i].append([])
                for idx, word in enumerate(sentence):
                    doc["lemma_paragraphs"][i][k].append(
                        lemmatizer(word, u"NOUN")[0])
    return data
def lemmatizer(d):
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    doc = nlp(d)
    str_doc = ''
    for tok in doc:
        # Note: the lemmatizer returns a list, so it must be indexed
        # to be 'unpacked' for the string comparison below.
        if tok.text != lemmatizer(tok.text, tok.pos_)[0] and tok.pos_ == 'NOUN':
            # print(tok.text, lemmatizer(tok.text, tok.pos_), tok.pos_, tok.tag_, '\n')
            str_doc = str_doc + str(lemmatizer(tok.text, tok.pos_)[0]) + ' '
        else:
            str_doc = str_doc + tok.text + ' '
    return str_doc
def test_issue595():
    """Test lemmatization of base forms"""
    words = ["Do", "n't", "feed", "the", "dog"]
    tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
    lookups.add_table("lemma_index", {"verb": {}})
    lookups.add_table("lemma_exc", {"verb": {}})
    lemmatizer = Lemmatizer(lookups)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=words)
    doc[2].tag_ = "VB"
    assert doc[2].text == "feed"
    assert doc[2].lemma_ == "feed"
def lemmatize_word(word):
    """
    Return the base form of a word.
    Input: word (possibly plural, with capital letters)
    Return: word (singular, lowercase)
    """
    palabra = Word(str(word).lower())
    palabra = palabra.lemmatize()
    if palabra == word:
        # TextBlob left the word unchanged, so fall back to the spaCy lemmatizer.
        lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        palabra = lemmatizer(word, u'NOUN')
        palabra = palabra[0]
    return palabra
def lemmatizer():
    return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))
def lemmatizer(path):
    if path is not None:
        return Lemmatizer.load(path)
    else:
        return None
def lemmatizer(package):
    return Lemmatizer.from_package(package)