def get_random_word_brown():
    sentences = get_brown(semcor.sents('brown2/tagfiles/br-n12.xml'))
    list1 = semcor.tagged_chunks('brown2/tagfiles/br-n12.xml', 'pos')
    list2 = semcor.tagged_chunks('brown2/tagfiles/br-n12.xml', 'sem')

    random_word = []
    temp_buffer = []
    i = 0
    for t in zip(list1, list2):
        pos = t[0].label()  # POS tag of the chunk
        lemma = t[0][0]     # first token of the chunk

        if pos == "NN" and hasattr(t[1], 'label'):
            if hasattr(t[1].label(), 'synset'):
                synset = t[1].label().synset()
                if hasattr(synset, 'name'):
                    temp_buffer.append((synset.name(), lemma))

        s = sentences[i]
        eof = s[-1]  # last token of the current sentence
        if lemma == eof:
            if len(temp_buffer) != 0:
                random_word.append(random.choice(temp_buffer))
            else:
                random_word.append((' ', ' '))
            temp_buffer = []
            i += 1
    return zip(random_word, sentences)
Example #2
def get_semcor_corpus():
    """builds a corpus of word frequencies using SemCor"""
    corpus = []
    for sentence in semcor.sents():
        sentence_proc = preprocess(' '.join(sentence))
        for word in sentence_proc:
            corpus.append(word.lower())

    word_freq = FreqDist(corpus)

    corpus_freqs = {}
    freqs, words = [], []

    for word in corpus:
        freqs.append(word_freq[word])
        words.append(word)

    # laplace smoothing
    freqs = np.array(freqs)
    freqs += 1

    # compute inverse weighting
    N = len(word_freq)
    freqs = np.log((1 + N) / freqs)

    for freq, word in zip(freqs, words):
        corpus_freqs[word] = freq

    return corpus_freqs
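A minimal usage sketch for the function above (it assumes the same imports the snippet relies on: numpy, nltk's FreqDist and semcor reader, plus the preprocess helper; the looked-up word is arbitrary):

weights = get_semcor_corpus()
print(weights.get('house'))  # rarer words receive larger inverse-frequency weights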
Example #3
    def parse(self):
        tagged_sents = semcor.tagged_sents(tag='sense')
        sents = semcor.sents()

        # tagged_sents returns senses of each word/group of words
        for sent, tag in zip(sents, tagged_sents):

            word_idx = 0
            for entry in tag:
                # check for no sense tag or multiword entries
                # TODO is it ok to exclude multiword entries?
                entry_len = len(entry.leaves())
                if entry.label() and entry_len == 1 and type(
                        entry.label()) != str:
                    #import pdb; pdb.set_trace()
                    entry = entry.label().synset().name().split('.')

                    if len(entry) == 3:  # check for (word.pos.nn) entry
                        word, pos, sense = entry

                        num_senses = self.count_senses(word)
                        context = self.get_context(sent, word_idx)

                        new_ex = Example(context, word,
                                         self.parse_sense(sense), pos,
                                         num_senses)
                        # add to data set
                        self.data.append(new_ex)
                        # TODO for now just take first sense found in sentence
                        break
                word_idx += entry_len  # one entry might be multiple words
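For illustration, a small hedged sketch of the label structure the split('.') above relies on, using the standard NLTK SemCor reader (the printed value is whatever the first sense-tagged chunk of the first sentence happens to be, e.g. ['group', 'n', '01']):

from nltk.corpus import semcor
from nltk.corpus.reader.wordnet import Lemma

for chunk in semcor.tagged_sents(tag='sem')[0]:
    if hasattr(chunk, 'label') and isinstance(chunk.label(), Lemma):
        print(chunk.label().synset().name().split('.'))
        break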
    def __iter__(self):

        sents = semcor.sents() #subset of the Brown Corpus, tokenized, but not tagged or chunked
        for s in sents :
            # ss = ' '.join(list(s))
            # temp = utils.simple_preprocess(' '.join((list(s))
            yield utils.simple_preprocess(' '.join(s))
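An iterator like the one above is typically streamed into gensim; a minimal hedged sketch (the wrapper class name and the training parameters are assumptions, using the gensim 4.x API):

from nltk.corpus import semcor
from gensim import utils
from gensim.models import Word2Vec

class SemcorSentences:
    """Hypothetical wrapper exposing the same __iter__ as the snippet above."""
    def __iter__(self):
        for s in semcor.sents():
            yield utils.simple_preprocess(' '.join(s))

model = Word2Vec(sentences=SemcorSentences(), vector_size=100, window=5, min_count=2)
print(model.wv.most_similar('time', topn=5))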
	def __init__(self, **kwargs):

		self._sents = []
		self._tagged_sents = []
		self._semcor_file_ids = self._load_semcor_file_ids()
		self._processor = kwargs.get('processor', lambda lexeme, definition, examples: (lexeme, definition, examples))

		for file_id in self._semcor_file_ids:
			self._sents.append(semcor.sents(file_id))
			self._tagged_sents.append(semcor.tagged_sents(file_id, 'both'))
def get_semcor_sentences(data_size):
    sentences, senses = [], []
    tagged_sentences = semcor.tagged_sents(tag='both')
    for index in range(data_size):
        node_noun = None
        for node in tagged_sentences[index]:
            # Keep the first chunk that is a sense-tagged noun
            if isinstance(node.label(), Lemma) and node[0].label() == 'NN':
                node_noun = node
                break
        if node_noun:
            senses.append(node_noun)
            sentences.append(" ".join(semcor.sents()[index]))
    return sentences, senses
def semcor_extraction() -> tuple:
    sentences = []
    extracted = []

    for i in range(0, 10):
        elem = list(
            filter(
                lambda sentence_tree: isinstance(sentence_tree.label(), Lemma)
                and sentence_tree[0].label() == "NN",
                semcor.tagged_sents(tag='both')[i]))

        if elem:
            extracted.append(random.choice(elem))
            sentences.append(" ".join(semcor.sents()[i]))

    return sentences, extracted
Example #8
def semcor_extraction(sentence_number=50):
    sentences = []
    extracted = []

    for i in range(0, sentence_number):

        # Extract the nouns from sentence i
        nouns = list(filter(lambda sentence_tree:
                            isinstance(sentence_tree.label(), Lemma) and
                            sentence_tree[0].label() == "NN", semcor.tagged_sents(tag='both')[i]))

        # Pick a random noun from the nouns list and remove it from sentence i
        if nouns:
            lemma = select_lemma(nouns).label()
            extracted.append(lemma)
            sentence = " ".join(semcor.sents()[i])
            sentences.append(remove_word(sentence, lemma.name()))
    return sentences, extracted
Example #9
	def loadSemcorSections(self):
		"""
		Loads the SemCor sections into two lists: one of plain sentences and
		one of tagged sentences.

		Returns:
		A dictionary with keys 'chunks' and 'sentences', whose values are a list of
		tagged SemCor sentences and a list of untagged SemCor sentences.
		"""
		sentencesGroupedBySense = defaultdict(list)
		listOfFileIds = semcor.fileids()
		listOfChunks = []
		listOfSentences = []
		for fileId in listOfFileIds:
			listOfChunks.append(semcor.tagged_sents(fileId, 'both'))
			listOfSentences.append(semcor.sents(fileId))	
		listOfChunks = self.removeLevelsOfListWithinList(listOfChunks)
		listOfSentences = self.removeLevelsOfListWithinList(listOfSentences)
		semcorData = {'chunks':listOfChunks, 'sentences':listOfSentences}
		return semcorData
Example #11
def semcor_extraction(sentence_number: int = 50) -> tuple:
    """
    Extracts `sentence_number` sentences from the SemCor corpus.
    From each of them it also extracts a random noun.
    :return: a tuple (list of extracted sentences, list of extracted nouns)
    """
    sentences = []
    extracted = []

    for i in range(0, sentence_number):
        elem = list(
            filter(
                lambda sentence_tree: isinstance(sentence_tree.label(), Lemma)
                and sentence_tree[0].label() == "NN",
                semcor.tagged_sents(tag='both')[i]))

        if elem:
            extracted.append(random.choice(elem))
            sentences.append(" ".join(semcor.sents()[i]))

    return sentences, extracted
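A minimal usage sketch for the function above (it assumes the imports used elsewhere in these snippets: semcor, Lemma and random); each extracted item is a chunk tree whose label is a WordNet Lemma:

sentences, nouns = semcor_extraction(sentence_number=5)
for sent, noun_tree in zip(sentences, nouns):
    print(noun_tree.label().synset().name(), '<-', sent[:60])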
Example #12
def process_semcor():
    print('semcor')
    from nltk.corpus import semcor
    count = 0
    word = 'bank'
    sen1 = 'depository_financial_institution.n.01'
    sen2 = 'bank.n.01'
    file_name = 'data/bank_semcor_labelled_tmp.txt'
    for f in semcor.fileids():
        sents = semcor.sents(f)
        tsents = semcor.tagged_sents(f, 'sem')
        for i in range(len(sents)):
            sent = sents[i]
            if (word in sent):
                if (sen1 in str(tsents[i])):
                    appendToFile(file_name, sentToStr(sent, '+'))
                elif (sen2 in str(tsents[i])):
                    appendToFile(file_name, sentToStr(sent, '-'))
                else:
                    appendToFile(file_name, sentToStr(sent, '0'))
                count += 1
                print(count)
    # (fragment) tally how often this sense descriptor has been seen for `word`
    if word in lemmas:
        lemmas[word][descriptor] = lemmas[word][descriptor] + 1 if descriptor in lemmas[word] else 1
    else:
        lemmas[word] = {descriptor: 1}  # this else branch prevents KeyError lookups on lemmas[word][synset]
    return word

print("Importing Lemma and Synsets")
lemmas = dict()
    # lemmas is a dict of dict,
    # lemmas[word] = dictionary of { synsets:frequency of synset when associated with a 'word' }
    # lemmas[word][synset] is a count of how many times a synset appears for each word
    # *** len(lemmas[word]) = the number of different senses a 'word' has in the corpus
taggedsentences = semcor.tagged_sents(tag='both')
    # all sentences, fully tagged from SEMCOR
plaintextsentences = semcor.sents()
    # all sentences from SEMCOR
targetsentences = {}
    # sentences containing the target word
pos = dict()
    # part-of-speech tags seen in the corpus
max_sentence_len = 0
lemmacount = {}


# find all sentences including exactly 1 occurence of 'back'
# not all of these sentences are related to the synsets we are looking for
# e.g. goes back relates to the verb go instead of back
for i, s in enumerate(plaintextsentences) :
    ss = ' '.join(list(s))
    if ss.count(' back ') == 1: 
Example #14
  try:
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
  except ImportError:
    comm = None

  if feature:

    if feature == 'ngram':
      featurevocab = ngram_vocab(n)
      with open(corpusfile, 'r') as f:
        matrix, featurecounts, wordcounts = cooc_matrix((line.split() for line in f), featurevocab, vocab, n=n, unk=UNK, verbose=True, comm=comm)
    elif feature == 'synset':
      featurevocab = synset_vocab()
      matrix, featurecounts, wordcounts = cooc_matrix(semcor.sents(), featurevocab, vocab, doc2wnd=synset_context(iter(semcor.tagged_sents(tag='sem'))), unk=UNK, interval=100, verbose=True, wndo2=None)
      featurevocab = [synset.name() for synset in featurevocab]
    else:
      raise NotImplementedError

    if comm is None or not comm.rank:
      sp.save_npz(outputroot+'.npz', matrix)
      with open(outputroot+'.pkl', 'wb') as f:
        pickle.dump({'words': vocab, feature+'s': featurevocab, 'wordcounts': wordcounts, feature+'counts': featurecounts}, f)
    else:
      sys.exit()

  else:

    with open(corpusfile, 'r') as f:
      matrix, counts = symmetric_cooc_matrix((line.split() for line in f), vocab, unk=UNK, verbose=True, comm=comm)
Example #15
from nltk.corpus import wordnet

import nltk
from nltk.tree import Tree
from nltk.corpus.reader.wordnet import Lemma
from nltk.corpus import semcor
from nltk.corpus import wordnet

noun = set(['NN', 'NNS', 'NNP', 'NNPS'])
verb = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
adjective = set(['JJ', 'JJR', 'JJS'])
adverb = set(['RB', 'RBR', 'RBS'])
substantive = noun | verb | adjective | adverb

corp = semcor.sents()

tags = semcor.tagged_sents(tag = 'sem')

n = 0

correct = 0
base = 0
total = 0

for sent in corp:

    sentence = ' '.join(sent)

    print(sentence)
			l_hypos = []
			for hypo_synset in synset.hyponyms():
				word = hypo_synset.name().split('.')[0]
				if word not in l_hypos:
					l_hypos.append(word)
			if l_hypos:
				random_index = random.randint(0, len(l_hypos) - 1)
				new_sentence.append(l_hypos[random_index])
			else:
				for w in word:
					new_sentence.append(w)
	print (' '.join(new_sentence))

if __name__ == "__main__":

	args = parse_command_line()

	l_sentence = semcor.sents()[args.index]
	sentence = ' '.join(l_sentence)

	print (sentence)

	s = semcor.tagged_sents(tag='sem')[args.index]
	# random.seed(a=0)

	if args.nym == 'synonym':
		print_synonym_sentence(s)
	elif args.nym == 'hypernym':
		print_hypernym_sentence(s)
	elif args.nym == 'hyponym':
		print_hyponym_sentence(s)
def process_semcor(ref_dict):
    '''
    Return a DataFrame that contains sentences along with citations and information about detected heteronyms
    '''
    sents = semcor.sents()
    tagged_sents = semcor.tagged_sents(tag='sem')

    sense_list = list(ref_dict['sense'])
    semcor_sents = pd.DataFrame(columns=['sentence', 'citation', 'heteronym'])
    word_duplicate_sense = set(ref_dict[ref_dict.duplicated(['sense'
                                                             ])]['word'])

    for sent_idx, sent in enumerate(tagged_sents):
        het_in_sent = []
        for token_idx, token in enumerate(sent):

            if type(token) == nltk.Tree:
                lemma = token.label()
                chunk = token.leaves()

                ## Check whether token is a heteronym
                if (type(lemma) == nltk.corpus.reader.wordnet.Lemma) and (
                        lemma.synset() in sense_list) and (len(chunk) == 1):

                    synset = lemma.synset()
                    word = chunk[0]

                    ## Take care of sense-duplicated heteronyms (rare),
                    ## e.g. project and projects can have the same sense but different pronunciations.
                    if word.lower() in word_duplicate_sense:
                        pron = list(ref_dict[(ref_dict['word'] == word.lower())
                                             & (ref_dict['sense'] == synset)]
                                    ['pronunciation'])
                        if pron:
                            het_in_sent.append((word.lower(), synset, pron[0]))

                    ## If the sense is not duplicated, the mapping to pron is one-to-one
                    else:
                        pron = list(ref_dict[ref_dict['sense'] == synset]
                                    ['pronunciation'])[0]
                        word_in_ref = list(
                            ref_dict[ref_dict['sense'] == synset]['word'])[0]
                        if word.lower() == word_in_ref:
                            het_in_sent.append((word_in_ref, synset, pron))

        if het_in_sent:
            new_row = {
                'sentence':
                "".join([
                    " " + i if not i.startswith("'")
                    and i not in string.punctuation else i
                    for i in sents[sent_idx]
                ]).strip(),
                'citation':
                'SemCor',
                'heteronym':
                het_in_sent
            }
            # note: DataFrame.append was removed in pandas 2.0; pd.concat is the modern replacement
            semcor_sents = semcor_sents.append(new_row, ignore_index=True)

    return semcor_sents
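A hedged usage sketch for process_semcor (it assumes the imports the function relies on: nltk, semcor, pandas and string; the sample heteronym rows below are illustrative assumptions, not real project data):

import pandas as pd
from nltk.corpus import wordnet as wn

# Two illustrative rows for 'bass' (music vs. fish); the ARPAbet pronunciations are assumptions.
bass_senses = wn.synsets('bass', pos=wn.NOUN)
ref_dict = pd.DataFrame({
    'word': ['bass', 'bass'],
    'sense': [bass_senses[0], bass_senses[-1]],
    'pronunciation': ['B EY1 S', 'B AE1 S'],
})
print(process_semcor(ref_dict).head())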
Example #18
        elif ematrix[i][j] == 'SUB':
            seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j - 1]))
            seq.append('SUB')
            i -= 1
            j -= 1
        elif ematrix[i][j] == 'INS':
            seq.append(str(cmatrix[i][j] - cmatrix[i][j - 1]))
            seq.append('INS')
            j -= 1
        elif ematrix[i][j] == 'DEL':
            seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j]))
            seq.append('DEL')
            i -= 1
    seq = ' '.join(reversed(seq))
    print(seq)


if __name__ == '__main__':
    #...TODO Parse arguments and load semcor sentences
    args = parse_command_line()

    l_sentence1 = semcor.sents()[args.untagged]
    l_sentence2 = semcor.sents()[args.tagged]

    # TODO print sentence1 and sentence2
    print(' '.join(l_sentence1))
    print(' '.join(l_sentence2))

    s2 = semcor.tagged_sents(tag='sem')[args.tagged]

    wordnet_edit_distance(l_sentence1, s2, args.sim)
Example #19
# Returns the cost of deleting a word
def del_cost(word):
    return 1  # unclear: the assignment description suggests len(word) instead


# Returns the cost of substituting word1 with word2
def sub_cost(word1, word2):
    return 1  # unclear: the assignment description suggests max(len(word1), len(word2)) instead


if __name__ == '__main__':
    #...TODO Parse arguments and load semcor sentences
    args = parse_command_line()

    l_sentence1 = semcor.sents()[args.index1]
    l_sentence2 = semcor.sents()[args.index2]

    # debug
    # l_sentence1 = ['A', 'Z', 'Q', 'R', 'X', 'A']
    # l_sentence2 = ['A', 'Z', 'J', 'Q', 'R', 'Y']

    # TODO print sentence1 and sentence2
    print(' '.join(l_sentence1))
    print(' '.join(l_sentence2))

    n = len(l_sentence1)
    m = len(l_sentence2)

    # Matrix of cost values. TODO initialize the matrix to the correct size
    # Matrix of edit operations corresponding to costs in cmatrix.
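    # --- hedged sketch (not from the original snippet): one way to initialize the two
    # --- matrices described above; del_cost is reused for insertions since both costs are 1 here
    cmatrix = [[0] * (m + 1) for _ in range(n + 1)]      # edit-distance costs
    ematrix = [[None] * (m + 1) for _ in range(n + 1)]   # back-pointers: 'SUB', 'INS', 'DEL'
    for i in range(1, n + 1):
        cmatrix[i][0] = cmatrix[i - 1][0] + del_cost(l_sentence1[i - 1])
        ematrix[i][0] = 'DEL'
    for j in range(1, m + 1):
        cmatrix[0][j] = cmatrix[0][j - 1] + del_cost(l_sentence2[j - 1])
        ematrix[0][j] = 'INS'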
Example #20
def main():
    print "user input(1) or semcor(2)?"

    num = raw_input()

    if num == "1":
        #input
        print "enter word"
        word = raw_input()
        for meaning in (net.synsets(word)):
            #print "Sense: " + re.findall("'.*'", str(meaning))[0]
            print "Sense: " + str(meaning)
            print meaning.definition() + "\n"
            hypernyms = (meaning.hypernyms())
            if len(hypernyms) > 0:
                print "\nHypernyms:"
                for meaning2 in hypernyms:
                    print re.findall("'.*'", str(meaning2))[0]

            hyponyms = (meaning.hyponyms())
            if len(hyponyms) > 0:
                print "\nHyponyms:"
                for meaning2 in hyponyms:
                    print re.findall("'.*'", str(meaning2))[0]

    #		print "\nHypernym Tree:"
    #		print (gethypernymtree(meaning))
            print "\n"

    #		dog = wn.synset('dog.n.01')
    #		hypo = lambda s: s.hyponyms()
    #	 	hyper = lambda s: s.hypernyms()
    #list(dog.closure(s.hypernyms(), depth=1)) == dog.hypernyms()
    #True
    #>>> list(dog.closure(hyper, depth=1)) == dog.hypernyms()

    elif (num == "2"):
        #semcor
        print "semcor"

        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print s + "\n"

            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print meanings[0].definition()
    elif num == "3":

        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text

        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))

        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)

    #	The function relextract() allows us to filter the reldicts
    #	according to the classes of the subject and object named entities.
    #	In addition, we can specify that the filler text has to match a given regular expression,
    #	 as illustrated in the next example. Here, we are looking for pairs of entities in the IN
    #	relation, where IN has signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print(fileid)
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG',
                                                   'LOC',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=IN):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

        roles = "(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"

        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER',
                                                   'ORG',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=ROLES):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
from nltk.corpus import conll2000, conll2002
print(conll2000.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE


# SEMCOR
    
from nltk.corpus import semcor
print(semcor.words())
print(semcor.chunks())
print(semcor.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
list(map(str, semcor.tagged_chunks(tag='both')[:3]))
[[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]    

# IEER

from nltk.corpus import ieer
ieer.fileids() # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
print(docs[0].docno)
print(docs[0].doctype)
print(docs[0].date_time)
print(docs[0].headline)
print(docs[0].text) # doctest: +ELLIPSIS
Example #22
results = PrettyTable()
results.add_column("Original Sentences", original_sentences)
results.add_column("Ambiguous Word", words_to_analyze)
results.add_column("Choosen Synset", choosen_synsets)
results.add_column("New Sentence", new_sentences)

#write the table to a file
table_txt = results.get_string()
with open('./output/Output Word Disambiguation.txt', 'w') as file:
    file.write(table_txt)

#----------------------------------------------------- SEMCOR TESTING -------------------------------------------#

#getting already semantically tagged sentences from semcor corpus
sem_tagged_sentences = sc.tagged_sents(tag='sem')[1:50]
#getting the same sentences untagged
semcor_sentences = sc.sents()[1:50]
#getting already pos tagged sentences from semcor corpus
pos_tagged_sentences = sc.tagged_sents(tag='pos')[1:50]
semtest_results = ts.semcorDisambiguation(sem_tagged_sentences,
                                          semcor_sentences,
                                          pos_tagged_sentences)
semtest_sample = semtest_results[0][1:10]
table_txt_semcor = semtest_results[0].get_string()
with open('./output/Output Semcor Testing.txt', 'w') as file:
    file.write(table_txt_semcor + "\r\n")
    file.write("Accuracy: " + str(semtest_results[1]) + "%")
sample_text = semtest_sample[0].get_string()

# Over 50 runs the average accuracy was 41.32%
Example #24
# seed for the random number generator, used to pick the nouns
SEED = 117
COUNT = 50
START = 0
STOPWORDS_PATH = 'stop_words_FULL.txt'

if __name__ == '__main__':
    random.seed(SEED)

    giuste = 0  # number of correctly disambiguated sentences
    semcor_sentences = extract_semcor_sentences(count=COUNT, start=START)
    stopw = load_stopwords(STOPWORDS_PATH)

    for i, tagged_sentence, nn_words in semcor_sentences:
        # Randomly pick a noun from the sentence:
        # lemma holds the Lemma object carrying the synset name,
        # word holds the word itself
        lemma, word = random.choice(nn_words)
        sentence = semcor.sents()[i]
        # Disambiguate by passing in the sentence and the word to
        # disambiguate, obtaining the best sense
        syn = lesk_disambiguate(sentence, word, stopwordset=stopw)
        # If the synset returned by the Lesk algorithm equals the synset
        # taken from the SemCor annotation, increment the number of
        # correct sentences
        if lemma.synset() == syn:
            giuste += 1

    accuracy = giuste / COUNT * 100

    print(f'Accuracy: {accuracy:.2f} %')