from pattern.nl import parse, singularize
# `dg` is the diminutive-generator module used by the original file (not shown here).


def return_dim_sent(sent):
    """
    Takes in a string sentence and checks if there are nouns in that sentence.
    If there are, it returns the sentence with the nouns in their diminutive form.

    :param sent: a string containing a sentence.
    :return: a string containing the sentence with the nouns turned into diminutives.
    :rtype: str
    """
    parsed = parse(sent, tokenize=True, tags=True, chunks=False)
    new_sent = []
    for word, pos in parsed.split()[0]:
        if pos == 'NN' and not word.endswith('je'):  # The word is a singular noun.
            dim = dg.generate_diminutive(word)
            if new_sent and new_sent[-1] == 'de':  # Guard against an empty list.
                new_sent[-1] = 'het'  # Correcting for the article. Not perfect though.
            new_sent.append(dim)
        elif pos == 'NNS' and not word.endswith('jes'):  # The word is a plural noun.
            root = singularize(word)
            dim = dg.generate_diminutive(root)
            new_sent.append(dim + "s")
        else:
            new_sent.append(word)
    return " ".join(new_sent)
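# A minimal usage sketch, assuming `dg` exposes generate_diminutive() as used
# above; the expected output is illustrative since it depends on that module.
sentence = "de hond speelt in de tuin"
print(return_dim_sent(sentence))
# -> something like "het hondje speelt in het tuintje"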
from pattern.nl import (lemma, predicative, attributive, singularize,
                        conjugate, PRESENT, SG)


def wordvarieties(word):
    """Return the lemma, predicative, attributive, singular and
    first-person-singular present-tense forms of a word."""
    lem = lemma(word)
    pre = predicative(word)
    att = attributive(word)
    sin = singularize(word)
    con = conjugate(word, PRESENT, 1, SG)
    return [lem, pre, att, sin, con]
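# Quick sketch of the output; every helper is applied regardless of the actual
# part of speech, so some of the five forms are only meaningful for verbs or
# adjectives.
print(wordvarieties("katten"))
# -> a list with the lemma, predicative and attributive forms, the singular
#    "kat", and a conjugated first-person-singular form.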
def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")):
        if nl.singularize(pl) == sg:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.88)
    print("pattern.nl.singularize()")
from pattern.nl import singularize  # or pattern.en, depending on the entity language


def run_lemmatization(entity_dict):
    """Group entities under their singularized form."""
    lemma_dict = {}
    for ent in entity_dict:
        lemma_ent = singularize(ent)
        # lemma_ent = lemma(ent)
        if lemma_ent.replace(" ", "") == "":
            # Fall back to the original entity if singularization yields an empty string.
            lemma_ent = ent
        if lemma_ent not in lemma_dict:
            lemma_dict[lemma_ent] = [ent]
        else:
            lemma_dict[lemma_ent].append(ent)
    return lemma_dict
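# Sketch of the expected mapping; the category labels are hypothetical and only
# illustrate the structure of entity_dict (surface form -> category).
entity_dict = {"katten": "ANIMAL", "kat": "ANIMAL", "fietsen": "VEHICLE"}
lemma_dict = run_lemmatization(entity_dict)
# -> {"kat": ["katten", "kat"], "fiets": ["fietsen"]}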
def run_lemma_lookup(lemma_dict, unmatched_entities, entity_dict):
    """Match unmatched entities to known entities via their singularized form."""
    updated_dict = {}
    # The `with` block closes the file, so no explicit close() is needed.
    with open('lemmatized_overview.txt', 'w') as fh:
        for entity in unmatched_entities:
            if len(entity) >= 4:  # More error-prone for smaller words.
                lemma_entity = singularize(entity)
                # lemma_entity = lemma(entity)
                if lemma_entity in lemma_dict:
                    if len(lemma_dict[lemma_entity]) == 1:
                        entity_match = lemma_dict[lemma_entity][0]
                        cat = entity_dict[entity_match]
                        if len(entity_match) >= 4:
                            updated_dict[entity] = cat
                            fh.write(entity + "\t" + entity_match + "\t" + cat + "\n")
    return updated_dict
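# Hedged end-to-end sketch chaining the two helpers above; labels are illustrative.
entity_dict = {"fiets": "VEHICLE", "huizen": "BUILDING"}
lemma_dict = run_lemmatization(entity_dict)
matches = run_lemma_lookup(lemma_dict, ["fietsen", "huis"], entity_dict)
# "fietsen" singularizes to "fiets", a known entity, so matches becomes
# {"fietsen": "VEHICLE", "huis": "BUILDING"}; the matched pairs are also
# written to lemmatized_overview.txt.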
from pattern.nl import parse, split, lemma, singularize, predicative


def prepare_text_nl(row):
    """ Prepares Dutch text by doing the following:

    * Lemmatize a word
    * Singularize a word
    * Make an adjective predicative

    Parameters:
    -----------
    row : pandas.Series
        A row of a pandas dataframe

    Returns:
    --------
    new_message : str
        The preprocessed message
    """
    try:
        message = split(parse(row.Message_Only_Text))
    except Exception:
        print(row.Message_Only_Text)
        return ''  # Parsing failed; return an empty message instead of crashing below.
    new_message = ''
    for sentence in message:
        for word, tag in sentence.tagged:
            if tag == 'MD' or 'VB' in tag:
                new_message += lemma(word) + ' '
            elif tag == 'NNS':
                new_message += singularize(word) + ' '
            elif 'JJ' in tag:
                new_message += predicative(word) + ' '
            else:
                new_message += word + ' '
    return new_message
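# Usage sketch on a hypothetical dataframe; the Message_Only_Text column name
# comes from the function above, everything else is illustrative.
import pandas as pd

df = pd.DataFrame({"Message_Only_Text": ["De katten liepen door de mooie tuinen"]})
df["clean"] = df.apply(prepare_text_nl, axis=1)
# Verbs are lemmatized, plural nouns singularized and adjectives made
# predicative, giving roughly "De kat lopen door de mooi tuin".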
import os
import sys
import time
import logging
from collections import defaultdict
from subprocess import Popen, PIPE
from xml.etree.ElementTree import Element, SubElement, Comment

from pattern.nl import singularize

# `prettify`, `model` (a word2vec model) and CMD_EXTRACTOR_SCRIPT are defined
# elsewhere in the original module.


def return_mods(words_found, path_to_db):
    """
    Finds the words and their modifiers using Ruben's terminology extractor.
    For now this function only works with the first words found in WordNet
    by search_in_dwn.

    :param words_found: list of words that are added to an XML pattern file.
    :type words_found: list
    :return container: a container of words with the output of the terminology
        extractor and the word2vec model search of the words in words_found.
    :rtype: dict
    """
    top = Element('patterns')
    comment = Comment('Pattern file for terminology extractor')
    top.append(comment)
    # Set up storage for later usage.
    container = {}
    for word in words_found:
        container[word] = defaultdict(list)  # Init defaultdict to store modifiers.
        child = SubElement(top, 'pattern', {'len': "2"})
        child.len = "2"
        # Only searches for A N patterns, which is why not all terms are found
        # as an entry in the returned dict. More patterns can be added here.
        SubElement(child, 'p', {"key": "pos", "position": "0", "values": "a"})
        SubElement(child, 'p', {"key": "tokens", "position": "1", "values": word})
    # Store the patterns file.
    if not os.path.isdir('patterns'):
        os.mkdir('patterns')
    logging.info("{} writing pattern file".format(time.strftime('%H:%M:%S')))
    file_name = os.path.abspath('.') + '/patterns/xml_pattern-{}.xml'.format(
        time.strftime('%d-%m-%y-%H:%M:%S'))
    with open(file_name, 'wb', 0) as f:  # 0 disables buffering (binary mode only).
        f.write(prettify(top).encode('utf8'))
    # Call the terminology extractor with the newly created patterns.
    cmd = ' '.join(['python', CMD_EXTRACTOR_SCRIPT, '-d', path_to_db, '-p', file_name])
    logging.info(cmd)
    logging.info("{} calling terminology extractor".format(time.strftime('%H:%M:%S')))
    process = Popen(cmd, stdout=PIPE, shell=True)
    output, err = process.communicate()
    # Store all the terms and their modifiers in a dictionary.
    for term_element in [line.split() for line in output.decode('utf8').split('\n') if line]:
        freq, mod, term = term_element
        try:
            container[term]['modifiers'].append((mod, freq))
        except KeyError:
            print("not found in container: {}".format(term), file=sys.stderr)
    # Expand each entry with singularized word2vec neighbours.
    for entry_term in container.keys():
        try:
            most_similar_words = model.most_similar(entry_term)
        except KeyError:
            print("not found in model: {}".format(entry_term), file=sys.stderr)
            continue
        singularized = [singularize(w) for w, _ in most_similar_words]
        container[entry_term]['similar'].extend(singularized)
        if entry_term not in container[entry_term]['similar']:
            # Put the search word in the results.
            container[entry_term]['similar'].append(entry_term)
    return container
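# Heavily hedged invocation sketch: the database path is a placeholder, and the
# extractor script plus word2vec model must be available as described above.
container = return_mods(["fiets", "huis"], "/path/to/terminology.db")
for term, info in container.items():
    print(term, info["modifiers"][:3], info["similar"][:3])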