def get_random_word_brown():
    sentences = get_brown(semcor.sents('brown2/tagfiles/br-n12.xml'))
    list1 = semcor.tagged_chunks('brown2/tagfiles/br-n12.xml', 'pos')
    list2 = semcor.tagged_chunks('brown2/tagfiles/br-n12.xml', 'sem')
    random_word = []
    temp_buffer = []
    i = 0
    for t in tuple(zip(list1, list2)):
        pos = t[0].label()
        lemma = t[0][0]
        if pos == "NN" and hasattr(t[1], 'label'):
            if hasattr(t[1].label(), 'synset'):
                synset = t[1].label().synset()
                if hasattr(synset, 'name'):
                    temp_buffer.append((synset.name(), lemma))
        s = sentences[i]
        eof = s[-1:]
        if lemma == eof:
            if len(temp_buffer) != 0:
                random_word.append(random.choice(temp_buffer))
            else:
                random_word.append((' ', ' '))
            temp_buffer = []
            i += 1
    return zip(random_word, sentences)
def get_semcor_corpus():
    """builds a corpus of word frequencies using SemCor"""
    corpus = []
    for sentence in semcor.sents():
        sentence_proc = preprocess(' '.join(sentence))
        for word in sentence_proc:
            corpus.append(word.lower())
    word_freq = FreqDist(corpus)
    corpus_freqs = {}
    freqs, words = [], []
    for word in corpus:
        freqs.append(word_freq[word])
        words.append(word)
    # laplace smoothing
    freqs = np.array(freqs)
    freqs += 1
    # compute inverse weighting
    N = len(word_freq)
    freqs = np.log((1 + N) / freqs)
    for freq, word in zip(freqs, words):
        corpus_freqs[word] = freq
    return corpus_freqs
def parse(self):
    tagged_sents = semcor.tagged_sents(tag='sense')
    sents = semcor.sents()
    # tagged_sents returns senses of each word/group of words
    for sent, tag in zip(sents, tagged_sents):
        word_idx = 0
        for entry in tag:
            # check for no sense tag or multiword entries
            # TODO is it ok to exclude multiword entries?
            entry_len = len(entry.leaves())
            if entry.label() and entry_len == 1 and type(entry.label()) != str:
                # import pdb; pdb.set_trace()
                entry = entry.label().synset().name().split('.')
                if len(entry) == 3:  # check for (word.pos.nn) entry
                    word, pos, sense = entry
                    num_senses = self.count_senses(word)
                    context = self.get_context(sent, word_idx)
                    new_ex = Example(context, word, self.parse_sense(sense), pos, num_senses)
                    # add to data set
                    self.data.append(new_ex)
                    # TODO for now just take first sense found in sentence
                    break
            word_idx += entry_len  # one entry might be multiple words
def __iter__(self):
    sents = semcor.sents()  # subset of the Brown Corpus, tokenized, but not tagged or chunked
    for s in sents:
        # ss = ' '.join(list(s))
        # temp = utils.simple_preprocess(' '.join(list(s)))
        yield utils.simple_preprocess(' '.join(list(s)))
def __init__(self, **kwargs):
    self._sents = []
    self._tagged_sents = []
    self._semcor_file_ids = self._load_semcor_file_ids()
    self._processor = kwargs.get(
        'processor',
        lambda lexeme, definition, examples: (lexeme, definition, examples))
    for file_id in self._semcor_file_ids:
        self._sents.append(semcor.sents(file_id))
        self._tagged_sents.append(semcor.tagged_sents(file_id, 'both'))
def get_semcor_sentences(data_size):
    sentences, senses = [], []
    for index in range(0, data_size):
        node_noun = None
        for node in semcor.tagged_sents(tag='both')[index]:
            # If node is a noun
            if isinstance(node.label(), Lemma) and node[0].label() == 'NN':
                node_noun = node
                break
        if node_noun:
            senses.append(node_noun)
            sentences.append(" ".join(semcor.sents()[index]))
    return sentences, senses
def semcor_extraction() -> tuple:
    sentences = []
    extracted = []
    for i in range(0, 10):
        elem = list(
            filter(
                lambda sentence_tree: isinstance(sentence_tree.label(), Lemma)
                and sentence_tree[0].label() == "NN",
                semcor.tagged_sents(tag='both')[i]))
        if elem:
            extracted.append(random.choice(elem))
            sentences.append(" ".join(semcor.sents()[i]))
    return sentences, extracted
def semcor_extraction(sentence_number=50):
    sentences = []
    extracted = []
    for i in range(0, sentence_number):
        # Extract the nouns from sentence i
        nouns = list(filter(lambda sentence_tree: isinstance(sentence_tree.label(), Lemma)
                            and sentence_tree[0].label() == "NN",
                            semcor.tagged_sents(tag='both')[i]))
        # Pick a random noun from the nouns list and remove it from sentence i
        if nouns:
            lemma = select_lemma(nouns).label()
            extracted.append(lemma)
            sentence = " ".join(semcor.sents()[i])
            sentences.append(remove_word(sentence, lemma.name()))
    return sentences, extracted
def loadSemcorSections(self):
    """
    Loads SemCor sections into two lists: one of plain sentences and one of tagged sentences.

    Returns:
        A dictionary with keys 'chunks' and 'sentences', whose values are a list of
        tagged SemCor sentences and a list of untagged SemCor sentences.
    """
    sentencesGroupedBySense = defaultdict(list)
    listOfFileIds = semcor.fileids()
    listOfChunks = []
    listOfSentences = []
    for fileId in listOfFileIds:
        listOfChunks.append(semcor.tagged_sents(fileId, 'both'))
        listOfSentences.append(semcor.sents(fileId))
    listOfChunks = self.removeLevelsOfListWithinList(listOfChunks)
    listOfSentences = self.removeLevelsOfListWithinList(listOfSentences)
    semcorData = {'chunks': listOfChunks, 'sentences': listOfSentences}
    return semcorData
def semcor_extraction(sentence_number: int = 50) -> tuple:
    """
    Extracts `sentence_number` sentences from the SemCor corpus.
    From each of them it also extracts a random noun.
    :return: Returns a tuple (extracted sentences list, extracted nouns list)
    """
    sentences = []
    extracted = []
    for i in range(0, sentence_number):
        elem = list(
            filter(
                lambda sentence_tree: isinstance(sentence_tree.label(), Lemma)
                and sentence_tree[0].label() == "NN",
                semcor.tagged_sents(tag='both')[i]))
        if elem:
            extracted.append(random.choice(elem))
            sentences.append(" ".join(semcor.sents()[i]))
    return sentences, extracted
def process_semcor():
    print('semcor')
    from nltk.corpus import semcor
    count = 0
    word = 'bank'
    sen1 = 'depository_financial_institution.n.01'
    sen2 = 'bank.n.01'
    file_name = 'data/bank_semcor_labelled_tmp.txt'
    for f in semcor.fileids():
        sents = semcor.sents(f)
        tsents = semcor.tagged_sents(f, 'sem')
        for i in range(len(sents)):
            sent = sents[i]
            if word in sent:
                if sen1 in str(tsents[i]):
                    appendToFile(file_name, sentToStr(sent, '+'))
                elif sen2 in str(tsents[i]):
                    appendToFile(file_name, sentToStr(sent, '-'))
                else:
                    appendToFile(file_name, sentToStr(sent, '0'))
                count = count + 1
    print(count)
    if word in lemmas:
        lemmas[word][descriptor] = lemmas[word][descriptor] + 1 if descriptor in lemmas[word] else 1
    else:
        lemmas[word] = {descriptor: 1}  # this else statement prevents keyerror lookups on lemmas[word][synset]
    return word


print("Importing Lemma and Synsets")
lemmas = dict()
# lemmas is a dict of dicts:
# lemmas[word] = dictionary of { synset: frequency of that synset when associated with 'word' }
# lemmas[word][synset] is a count of how many times a synset appears for each word
# *** len(lemmas[word]) = the number of different senses a 'word' has in the corpus
taggedsentences = semcor.tagged_sents(tag='both')  # all sentences, fully tagged, from SemCor
plaintextsentences = semcor.sents()  # all sentences from SemCor
targetsentences = {}  # sentences containing 'point'
pos = dict()  # part-of-speech tags seen in the corpus
max_sentence_len = 0
lemmacount = {}

# find all sentences including exactly 1 occurrence of 'back'
# not all of these sentences are related to the synsets we are looking for
# e.g. "goes back" relates to the verb "go" instead of "back"
for i, s in enumerate(plaintextsentences):
    ss = ' '.join(list(s))
    if ss.count(' back ') == 1:
try:
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
except ImportError:
    comm = None

if feature:
    if feature == 'ngram':
        featurevocab = ngram_vocab(n)
        with open(corpusfile, 'r') as f:
            matrix, featurecounts, wordcounts = cooc_matrix((line.split() for line in f),
                                                            featurevocab, vocab, n=n,
                                                            unk=UNK, verbose=True, comm=comm)
    elif feature == 'synset':
        featurevocab = synset_vocab()
        matrix, featurecounts, wordcounts = cooc_matrix(semcor.sents(), featurevocab, vocab,
                                                        doc2wnd=synset_context(iter(semcor.tagged_sents(tag='sem'))),
                                                        unk=UNK, interval=100, verbose=True, wndo2=None)
        featurevocab = [synset.name() for synset in featurevocab]
    else:
        raise NotImplementedError
    if comm is None or not comm.rank:
        sp.save_npz(outputroot + '.npz', matrix)
        with open(outputroot + '.pkl', 'wb') as f:
            pickle.dump({'words': vocab, feature + 's': featurevocab,
                         'wordcounts': wordcounts, feature + 'counts': featurecounts}, f)
    else:
        sys.exit()
else:
    with open(corpusfile, 'r') as f:
        matrix, counts = symmetric_cooc_matrix((line.split() for line in f), vocab,
                                               unk=UNK, verbose=True, comm=comm)
from nltk.corpus import wordnet
import nltk
from nltk.tree import Tree
from nltk.corpus.reader.wordnet import Lemma
from nltk.corpus import semcor

noun = set(['NN', 'NNS', 'NNP', 'NNPS'])
verb = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
adjective = set(['JJ', 'JJR', 'JJS'])
adverb = set(['RB', 'RBR', 'RBS'])
substantive = noun | verb | adjective | adverb

corp = semcor.sents()
tags = semcor.tagged_sents(tag='sem')
n = 0
correct = 0
base = 0
total = 0
for sent in corp:
    sentence = ' '.join(sent)
    print(sentence)
    l_hypos = []
    for hypo_synset in synset.hyponyms():
        word = hypo_synset.name().split('.')[0]
        if word not in l_hypos:
            l_hypos.append(word)
    if l_hypos:
        random_index = random.randint(0, len(l_hypos) - 1)
        new_sentence.append(l_hypos[random_index])
    else:
        for w in word:
            new_sentence.append(w)
    print(' '.join(new_sentence))


if __name__ == "__main__":
    args = parse_command_line()
    l_sentence = semcor.sents()[args.index]
    sentence = ' '.join(l_sentence)
    print(sentence)
    s = semcor.tagged_sents(tag='sem')[args.index]
    # random.seed(a=0)
    if args.nym == 'synonym':
        print_synonym_sentence(s)
    elif args.nym == 'hypernym':
        print_hypernym_sentence(s)
    elif args.nym == 'hyponym':
        print_hyponym_sentence(s)
def process_semcor(ref_dict):
    '''
    Return a DataFrame that contains sentences along with citations
    and information on detected heteronyms
    '''
    sents = semcor.sents()
    tagged_sents = semcor.tagged_sents(tag='sem')
    sense_list = list(ref_dict['sense'])
    semcor_sents = pd.DataFrame(columns=['sentence', 'citation', 'heteronym'])
    word_duplicate_sense = set(ref_dict[ref_dict.duplicated(['sense'])]['word'])

    for sent_idx, sent in enumerate(tagged_sents):
        het_in_sent = []
        for token_idx, token in enumerate(sent):
            if type(token) == nltk.Tree:
                lemma = token.label()
                chunk = token.leaves()
                ## Check whether token is a heteronym
                if (type(lemma) == nltk.corpus.reader.wordnet.Lemma) and (
                        lemma.synset() in sense_list) and (len(chunk) == 1):
                    synset = lemma.synset()
                    word = chunk[0]
                    ## Take care of sense-duplicated heteronyms (rare),
                    ## e.g. "project" and "projects" can have the same sense but different pronunciations.
                    if word.lower() in word_duplicate_sense:
                        pron = list(ref_dict[(ref_dict['word'] == word.lower())
                                             & (ref_dict['sense'] == synset)]['pronunciation'])
                        if pron:
                            het_in_sent.append((word.lower(), synset, pron[0]))
                    ## If the sense is not duplicated, the mapping to pronunciation is one-to-one
                    else:
                        pron = list(ref_dict[ref_dict['sense'] == synset]['pronunciation'])[0]
                        word_in_ref = list(ref_dict[ref_dict['sense'] == synset]['word'])[0]
                        if word.lower() == word_in_ref:
                            het_in_sent.append((word_in_ref, synset, pron))
        if het_in_sent:
            new_row = {
                'sentence': "".join([
                    " " + i if not i.startswith("'") and i not in string.punctuation else i
                    for i in sents[sent_idx]
                ]).strip(),
                'citation': 'SemCor',
                'heteronym': het_in_sent
            }
            semcor_sents = semcor_sents.append(new_row, ignore_index=True)
    return semcor_sents
    elif ematrix[i][j] == 'SUB':
        seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j - 1]))
        seq.append('SUB')
        i -= 1
        j -= 1
    elif ematrix[i][j] == 'INS':
        seq.append(str(cmatrix[i][j] - cmatrix[i][j - 1]))
        seq.append('INS')
        j -= 1
    elif ematrix[i][j] == 'DEL':
        seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j]))
        seq.append('DEL')
        i -= 1
    seq = ' '.join(reversed(seq))
    print(seq)


if __name__ == '__main__':
    # ...TODO Parse arguments and load semcor sentences
    args = parse_command_line()
    l_sentence1 = semcor.sents()[args.untagged]
    l_sentence2 = semcor.sents()[args.tagged]

    # TODO print sentence1 and sentence2
    print(' '.join(l_sentence1))
    print(' '.join(l_sentence2))

    s2 = semcor.tagged_sents(tag='sem')[args.tagged]
    wordnet_edit_distance(l_sentence1, s2, args.sim)
# Returns the cost of deleting a word
def del_cost(word):
    return 1  # not sure; per the description it should be len(word)


# Returns the cost of substituting word1 with word2
def sub_cost(word1, word2):
    return 1  # not sure; per the description it should be max(len(word1), len(word2))


if __name__ == '__main__':
    # ...TODO Parse arguments and load semcor sentences
    args = parse_command_line()
    l_sentence1 = semcor.sents()[args.index1]
    l_sentence2 = semcor.sents()[args.index2]
    # debug
    # l_sentence1 = ['A', 'Z', 'Q', 'R', 'X', 'A']
    # l_sentence2 = ['A', 'Z', 'J', 'Q', 'R', 'Y']

    # TODO print sentence1 and sentence2
    print(' '.join(l_sentence1))
    print(' '.join(l_sentence2))

    n = len(l_sentence1)
    m = len(l_sentence2)

    # Matrix of cost values. TODO initialize the matrix to the correct size
    # Matrix of edit operations corresponding to costs in cmatrix.
def main():
    print("user input(1) or semcor(2)?")
    num = input()

    if num == "1":  # input
        print("enter word")
        word = input()
        for meaning in net.synsets(word):
            # print("Sense: " + re.findall("'.*'", str(meaning))[0])
            print("Sense: " + str(meaning))
            print(meaning.definition() + "\n")
            hypernyms = meaning.hypernyms()
            if len(hypernyms) > 0:
                print("\nHypernyms:")
                for meaning2 in hypernyms:
                    print(re.findall("'.*'", str(meaning2))[0])
            hyponyms = meaning.hyponyms()
            if len(hyponyms) > 0:
                print("\nHyponyms:")
                for meaning2 in hyponyms:
                    print(re.findall("'.*'", str(meaning2))[0])
            # print("\nHypernym Tree:")
            # print(gethypernymtree(meaning))
            print("\n")
            # dog = wn.synset('dog.n.01')
            # hypo = lambda s: s.hyponyms()
            # hyper = lambda s: s.hypernyms()
            # list(dog.closure(s.hypernyms(), depth=1)) == dog.hypernyms()  # True
            # >>> list(dog.closure(hyper, depth=1)) == dog.hypernyms()

    elif num == "2":  # semcor
        print("semcor")
        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print(s + "\n")
            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print(meanings[0].definition())

    elif num == "3":
        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text
        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))
        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)

        # The function relextract() allows us to filter the reldicts
        # according to the classes of the subject and object named entities.
        # In addition, we can specify that the filler text has to match a given regular expression,
        # as illustrated in the next example. Here, we are looking for pairs of entities in the IN
        # relation, where IN has signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print(fileid)
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

        roles = r"(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"
        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
from nltk.corpus import conll2000, conll2002

print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2002.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# SEMCOR
from nltk.corpus import semcor
print(semcor.words())
print(semcor.chunks())
print(semcor.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
list(map(str, semcor.tagged_chunks(tag='both')[:3]))
[[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]

# IEER
from nltk.corpus import ieer
ieer.fileids()  # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
print(docs[0].docno)
print(docs[0].doctype)
print(docs[0].date_time)
print(docs[0].headline)
print(docs[0].text)  # doctest: +ELLIPSIS
results = PrettyTable()
results.add_column("Original Sentences", original_sentences)
results.add_column("Ambiguous Word", words_to_analyze)
results.add_column("Chosen Synset", choosen_synsets)
results.add_column("New Sentence", new_sentences)

# write the table to a file
table_txt = results.get_string()
with open('./output/Output Word Disambiguation.txt', 'w') as file:
    file.write(table_txt)

# ----------------------------------------------------- SEMCOR TESTING -------------------------------------------#
# getting already semantically tagged sentences from the semcor corpus
sem_tagged_sentences = sc.tagged_sents(tag='sem')[1:50]
# getting the same sentences untagged
semcor_sentences = sc.sents()[1:50]
# getting already pos tagged sentences from the semcor corpus
pos_tagged_sentences = sc.tagged_sents(tag='pos')[1:50]

semtest_results = ts.semcorDisambiguation(sem_tagged_sentences, semcor_sentences, pos_tagged_sentences)
semtest_sample = semtest_results[0][1:10]
table_txt_semcor = semtest_results[0].get_string()
with open('./output/Output Semcor Testing.txt', 'w') as file:
    file.write(table_txt_semcor + "\r\n")
    file.write("Accuracy: " + str(semtest_results[1]) + "%")
sample_text = semtest_sample[0].get_string()

# Over 50 trials the average accuracy was 41.32%
# seed for the random number generator, used to choose the nouns
SEED = 117
COUNT = 50
START = 0
STOPWORDS_PATH = 'stop_words_FULL.txt'

if __name__ == '__main__':
    random.seed(SEED)
    giuste = 0
    semcor_sentences = extract_semcor_sentences(count=COUNT, start=START)
    stopw = load_stopwords(STOPWORDS_PATH)
    for i, tagged_sentence, nn_words in semcor_sentences:
        # Randomly pick a noun from the sentence:
        # lemma holds the Lemma object that carries the synset name,
        # word holds the word itself
        lemma, word = random.choice(nn_words)
        sentence = semcor.sents()[i]
        # Disambiguate by passing the sentence and the word to
        # disambiguate, obtaining the best sense
        syn = lesk_disambiguate(sentence, word, stopwordset=stopw)
        # If the synset obtained with the Lesk algorithm equals the
        # synset from the SemCor annotation, increment the count
        # of correct sentences
        if lemma.synset() == syn:
            giuste += 1
    accuracy = giuste / COUNT * 100
    print(f'Accuracy: {accuracy:.2f} %')