Example #1
    def semcor_sentences(self, labeled=True, original_sense=False):
        sentences = []
        for s in semcor.tagged_sents(tag="both"):
            triplets = []
            for w in s:
                sense = w.label()
                if original_sense:
                    triplets += [(sense, w, self.clean_pos(w, p))
                                 for (w, p) in w.pos()]
                else:
                    triplets += [(self.check_sense(sense=sense,
                                                   word=w,
                                                   tag=self.clean_pos(w,
                                                                      p)), w,
                                  self.clean_pos(w, p)) for (w, p) in w.pos()]

            senses, words, tags = zip(*triplets)

            if labeled:
                sentence = Sentence(words=words,
                                    pos_tags=tags,
                                    senses=self.clean_labels(senses))
            #for testing
            else:
                sentence = Sentence(words=words, pos_tags=tags)

            sentences.append(sentence)
        return sentences
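A minimal sketch (assuming NLTK 3.x with the semcor and wordnet data downloaded) of the chunk structure this method walks: with tag="both" each sentence is a list of Trees whose label() is a WordNet Lemma (or a plain string / None for untagged chunks) and whose pos() yields (word, POS) pairs.

from nltk.corpus import semcor

first_sent = semcor.tagged_sents(tag="both")[0]
for chunk in first_sent:
    # label(): Lemma, str or None; pos(): list of (word, POS-tag) tuples
    print(chunk.label(), chunk.pos())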
Example #2
def generate_semcor_data(filename):
    # Exclude the files containing verb-only annotations
    print("Loading semcor data from nltk, excluding verb-only annotations.")
    file_ids = list(filter(lambda k: 'brownv' not in k, semcor.fileids()))
    tagged_sents = semcor.tagged_sents(fileids=file_ids, tag='both')
    
    sent_words = []     # Sentences
    sent_labels = []    # Super-Sense Labels
    
    print("Writing semcor raw data to {}...".format(filename))
    i = 0
    f = open(filename, 'w')
    for sent in tagged_sents:
        i += 1
        words = []
        labels = []
        for tag in sent:
            _words = tag.leaves()
            _supersenses = get_supersenses(_words, tag.label())
            
            words = np.append(words, _words)
            labels = np.append(labels, _supersenses)
        sys.stdout.write('\rWrote {} examples.\r'.format(i))
        sys.stdout.flush()
        sentence = " ".join(words.flatten())
        tags     = " ".join(labels.flatten())
        f.write("{} <||> {}\n".format(sentence, tags))
    f.close()
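A hedged usage sketch (the path is illustrative): each line written by generate_semcor_data pairs the sentence tokens with their supersense labels, separated by " <||> ".

generate_semcor_data("semcor_raw.txt")          # hypothetical output path
with open("semcor_raw.txt") as f:
    sentence, tags = f.readline().rstrip("\n").split(" <||> ")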
Example #3
    def parse(self):
        tagged_sents = semcor.tagged_sents(tag='sense')
        sents = semcor.sents()

        # tagged_sents returns senses of each word/group of words
        for sent, tag in zip(sents, tagged_sents):

            word_idx = 0
            for entry in tag:
                # check for no sense tag or multiword entries
                # TODO is it ok to exclude multiword entries?
                entry_len = len(entry.leaves())
                if entry.label() and entry_len == 1 and type(
                        entry.label()) != str:
                    #import pdb; pdb.set_trace()
                    entry = entry.label().synset().name().split('.')

                    if len(entry) == 3:  # check for (word.pos.nn) entry
                        word, pos, sense = entry

                        num_senses = self.count_senses(word)
                        context = self.get_context(sent, word_idx)

                        new_ex = Example(context, word,
                                         self.parse_sense(sense), pos,
                                         num_senses)
                        # add to data set
                        self.data.append(new_ex)
                        # TODO for now just take first sense found in sentence
                        break
                word_idx += entry_len  # one entry might be multiple words
def CollectSemcorSupersenses():
  oracle_matrix = collections.defaultdict(WordSupersenses)
  for sent in semcor.tagged_sents(tag='both'):
    for chk in sent:
      if chk.node and len(chk.node)>3 and chk.node[-3]=='.' and chk.node[-2:].isdigit():
        if chk[0].node.startswith('N'):
          pos = "n"
        elif chk[0].node.startswith('V'):
          pos = "v"
        else:
          continue
        lemmas = chk.node[:-3]
        wnsn = int(chk.node[-2:])
        ssets = wn.synsets(lemmas, pos)
        sorted_ssets = sorted(ssets, key=lambda x: x.name)
        filtered_ssets = None
        for lemma in lemmas.split("_"):  
          if not filtered_ssets or len(filtered_ssets) == 0:
            filtered_ssets = filter(lambda x: lemma in x.name, sorted_ssets)
        if filtered_ssets and len(filtered_ssets) > 0:
          sorted_ssets = filtered_ssets
        try:
          supersense = sorted_ssets[wnsn-1].lexname  # e.g. 'noun.group'
        except:
          #print("."),
          continue
        for lemma in lemmas.split("_"):        
          ssets = wn.synsets(lemma, pos)
          if len(ssets) > 0:
            if lemma.isdigit():
              lemma = "0"
            oracle_matrix[lemma].Add(supersense, "semcor")  
  return oracle_matrix      
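Note that this snippet relies on the pre-3.0 NLTK Tree API (chk.node, and .name/.lexname as attributes). Under NLTK 3.x the sense tag of a chunk is a Lemma object, so the supersense lookup reduces to roughly this sketch (chk being the same loop variable as above):

from nltk.corpus.reader.wordnet import Lemma

label = chk.label()                      # NLTK 3 replacement for chk.node
if isinstance(label, Lemma):
    supersense = label.synset().lexname()   # e.g. 'noun.group'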
Example #5
def pickleDataSet(filestr):    
    try :
        file = open(filestr,'wb')
        filetext = open(filestr+'.txt','wb')
        filemap = open(filestr+'.map','wb')
        sents=semcor.tagged_sents(tag='both')
        ps = PorterStemmer()
        datalist=[]
        wordaddressmap = {}
        def addWordAddress(word, linenum):
            if word not in wordaddressmap:
                wordaddressmap[word]=set([])
            st = wordaddressmap[word]
            st.add(linenum)
        def getWordAdresses(word):
            if word in wordaddressmap:
                return wordaddressmap[word]
            return None
        for i,s in enumerate(sents):
            sentence,sentencedata=getFeaturesInSentence(s,ps,debugSentIndex=i)
            datalist.append(sentencedata)
            filetext.write(sentence+'\n')
            for word in nltk.word_tokenize(sentence):
                addWordAddress(word,i)
        pickle.dump(datalist,file)
        pickle.dump(wordaddressmap,filemap)
    except pickle.PicklingError as pe:
        print(pe)
	def __init__(self, **kwargs):

		self._sents = []
		self._tagged_sents = []
		self._semcor_file_ids = self._load_semcor_file_ids()
		self._processor = kwargs.get('processor', lambda lexeme, definition, examples: (lexeme, definition, examples))

		for file_id in self._semcor_file_ids:
			self._sents.append(semcor.sents(file_id))
			self._tagged_sents.append(semcor.tagged_sents(file_id, 'both'))
Example #7
def extract_semcor_sentences(count=50, start=0):
    semcor_sentences = []
    # Extract the sentence from SemCor as chunks, each with its own PoS
    for i, sentence in enumerate(semcor.tagged_sents(tag='both')[start:]):
        # Look for the nouns in the sentence
        nn_words = get_nn_words(sentence)
        if len(nn_words) > 0:
            semcor_sentences.append((i + start, sentence, nn_words))
        if len(semcor_sentences) == count:
            break
    return semcor_sentences
def get_semcor_sentences(data_size):
    sentences, senses = [], []
    for index in range(0, data_size):
        node_noun = None
        for node in semcor.tagged_sents(tag='both')[index]:
            # If node is a noun
            if isinstance(node.label(), Lemma) and node[0].label() == 'NN':
                node_noun = node
                break
        if node_noun:
            senses.append(node_noun)
            sentences.append(" ".join(semcor.sents()[index]))
    return sentences, senses
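Hypothetical call: senses holds the matched noun chunks (Trees labelled with a Lemma) and sentences the corresponding plain-text sentences.

sentences, senses = get_semcor_sentences(20)
for sent, noun_chunk in zip(sentences, senses):
    print(noun_chunk.label(), "->", sent)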
Example #9
def prepareDataSet():
    sents = [[re.split(r'\(', str(c)) for c in s] for s in semcor.tagged_sents(tag='both')[:10]]
    ps = PorterStemmer()
    siz = int(len(sents)*0.9)
    trainList=[]
    testList=[]
    for s in sents[:siz]:
        trainList.extend(getFeaturesInSentence(s,ps))
    for s in sents[siz:]:
        testList.extend(getFeaturesInSentence(s,ps))
    train_set = nltk.classify.util.apply_features(extract_features, trainList,labeled=False)
    test_set = nltk.classify.util.apply_features(extract_features, testList,labeled=False)
    return train_set, test_set
def semcor_extraction() -> tuple:
    sentences = []
    extracted = []

    for i in range(0, 10):
        elem = list(
            filter(
                lambda sentence_tree: isinstance(sentence_tree.label(), Lemma)
                and sentence_tree[0].label() == "NN",
                semcor.tagged_sents(tag='both')[i]))

        if elem:
            extracted.append(random.choice(elem))
            sentences.append(" ".join(semcor.sents()[i]))

    return sentences, extracted
Example #11
    def establich_signature(self):
        signatures = {}
        for s in semcor.tagged_sents(tag='both'):
            words = []
            for tree in s:
                if tree.label().__class__.__name__ == 'Lemma':
                    words.append(tree.label().name())
            words = set(words)
            print(words)
            for tree in s:
                if tree.label().__class__.__name__ == 'Lemma':
                    if tree.label().synset().name() not in signatures.keys():
                        signatures[tree.label().synset().name()] = set()
                    signatures[tree.label().synset().name()] |= words

        with open('signatures.pickle', 'wb') as f:
            pickle.dump(signatures, f)
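Reading the pickled signatures back (a hypothetical usage; keys are synset names such as 'dog.n.01' and values are sets of lemma names of the sense-tagged words that co-occur in the same SemCor sentences):

import pickle

with open('signatures.pickle', 'rb') as f:
    signatures = pickle.load(f)
print(signatures.get('dog.n.01', set()))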
Example #12
def create_semcor_data_files(length):

    print("loading semcor...")
    sentences = semcor.chunk_sents()
    senses = [[str(c) for c in s]
              for s in semcor.tagged_sents(tag='both')[:length]]

    with open('data/sense.pkl', 'wb') as outfile:
        pickle.dump(senses, outfile, pickle.HIGHEST_PROTOCOL)

    print("semcor loaded")

    if length != -1:

        return sentences[:length]
    else:
        return sentences
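Hypothetical usage: the return value is the first length chunked sentences, and the parallel string-rendered sense trees are pickled to data/sense.pkl.

import pickle

chunked = create_semcor_data_files(500)
with open('data/sense.pkl', 'rb') as f:
    senses = pickle.load(f)
assert len(chunked) == len(senses) == 500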
Example #13
def semcor_extraction(sentence_number=50):
    sentences = []
    extracted = []

    for i in range(0, sentence_number):

        # Extract the nouns from sentence i
        nouns = list(filter(lambda sentence_tree:
                            isinstance(sentence_tree.label(), Lemma) and
                            sentence_tree[0].label() == "NN", semcor.tagged_sents(tag='both')[i]))

        # Pick a random noun from the nouns list and remove it from sentence i
        if nouns:
            lemma = select_lemma(nouns).label()
            extracted.append(lemma)
            sentence = " ".join(semcor.sents()[i])
            sentences.append(remove_word(sentence, lemma.name()))
    return sentences, extracted
Example #14
	def loadSemcorSections(self):
		"""
		Loads semcor sections into two lists, one of just sentences and one of
		tagged sentences.
		
		Returns:
		A dictionary with keys 'chunks' and 'sentences', whose values are a list of
		tagged semcor sentences and a list of untagged semcor sentences.
		"""
		sentencesGroupedBySense = defaultdict(list)
		listOfFileIds = semcor.fileids()
		listOfChunks = []
		listOfSentences = []
		for fileId in listOfFileIds:
			listOfChunks.append(semcor.tagged_sents(fileId, 'both'))
			listOfSentences.append(semcor.sents(fileId))	
		listOfChunks = self.removeLevelsOfListWithinList(listOfChunks)
		listOfSentences = self.removeLevelsOfListWithinList(listOfSentences)
		semcorData = {'chunks':listOfChunks, 'sentences':listOfSentences}
		return semcorData
Example #16
def load(word):
    train_instances = []

    for sentence in semcor.tagged_sents(tag='sem')[:]:
        context = get_context(sentence)
        for el in sentence:
            if type(el) is Tree:
                # type(el) is Tree
                lemm = ' '.join(el.leaves())

                if word != None and lemm != word:
                    continue

                try:
                    golden_key = el.label().key()
                except AttributeError:
                    continue
                one_instance = custom_WSDInstance(lemm, context, golden_key)
                train_instances.append(one_instance)

    return train_instances
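A usage sketch: passing None collects every sense-keyed instance, while passing a surface form (here 'bank', chosen purely for illustration) keeps only the chunks whose text matches it.

bank_instances = load('bank')   # only chunks whose joined leaves equal 'bank'
all_instances = load(None)      # every chunk carrying a WordNet sense key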
Example #17
	def semcor_sentences(self, labeled=True, original_sense=False):
		sentences = []
		for s in semcor.tagged_sents(tag="both"):
			triplets = []
			for w in s:
				sense = w.label()
				if original_sense:
					triplets += [(sense, w, self.clean_pos(w, p)) for (w,p) in w.pos()]
				else:
					triplets += [(self.check_sense(sense=sense, word=w, tag=self.clean_pos(w, p)), w, self.clean_pos(w, p)) for (w,p) in w.pos()]

			senses, words, tags = zip(*triplets)			

			if labeled:
				sentence = Sentence(words=words, pos_tags=tags, senses=self.clean_labels(senses))
			#for testing
			else:
				sentence = Sentence(words=words, pos_tags=tags)
			
			sentences.append(sentence)
		return sentences
Example #18
def semcor_extraction(sentence_number: int = 50) -> tuple:
    """
    Extracts `sentence_number` sentences from the SemCor corpus.
    From each of them, it also extracts a random noun.
    :return: Returns a tuple (extracted sentences list, extracted nouns list)
    """
    sentences = []
    extracted = []

    for i in range(0, sentence_number):
        elem = list(
            filter(
                lambda sentence_tree: isinstance(sentence_tree.label(), Lemma)
                and sentence_tree[0].label() == "NN",
                semcor.tagged_sents(tag='both')[i]))

        if elem:
            extracted.append(random.choice(elem))
            sentences.append(" ".join(semcor.sents()[i]))

    return sentences, extracted
Example #19
def process_semcor():
    print('semcor')
    from nltk.corpus import semcor
    count = 0
    word = 'bank'
    sen1 = 'depository_financial_institution.n.01'
    sen2 = 'bank.n.01'
    file_name = 'data/bank_semcor_labelled_tmp.txt'
    for f in semcor.fileids():
        sents = semcor.sents(f)
        tsents = semcor.tagged_sents(f, 'sem')
        for i in range(len(sents)):
            sent = sents[i]
            if (word in sent):
                if (sen1 in str(tsents[i])):
                    appendToFile(file_name, sentToStr(sent, '+'))
                elif (sen2 in str(tsents[i])):
                    appendToFile(file_name, sentToStr(sent, '-'))
                else:
                    appendToFile(file_name, sentToStr(sent, '0'))
                count = count + 1
                print(count)
Example #20
def CollectSemcorSupersenses():
    oracle_matrix = collections.defaultdict(WordSupersenses)
    for sent in semcor.tagged_sents(tag='both'):
        for chk in sent:
            if chk.node and len(chk.node) > 3 and chk.node[
                    -3] == '.' and chk.node[-2:].isdigit():
                if chk[0].node.startswith('N'):
                    pos = "n"
                elif chk[0].node.startswith('V'):
                    pos = "v"
                else:
                    continue
                lemmas = chk.node[:-3]
                wnsn = int(chk.node[-2:])
                ssets = wn.synsets(lemmas, pos)
                sorted_ssets = sorted(ssets, key=lambda x: x.name)
                filtered_ssets = None
                for lemma in lemmas.split("_"):
                    if not filtered_ssets or len(filtered_ssets) == 0:
                        filtered_ssets = filter(lambda x: lemma in x.name,
                                                sorted_ssets)
                if filtered_ssets and len(filtered_ssets) > 0:
                    sorted_ssets = filtered_ssets
                try:
                    supersense = sorted_ssets[wnsn - 1].lexname  # e.g. 'noun.group'
                except:
                    #print("."),
                    continue
                for lemma in lemmas.split("_"):
                    ssets = wn.synsets(lemma, pos)
                    if len(ssets) > 0:
                        if lemma.isdigit():
                            lemma = "0"
                        oracle_matrix[lemma].Add(supersense, "semcor")
    return oracle_matrix
Example #21
#creating a table containing the results
results = PrettyTable()
results.add_column("Original Sentences", original_sentences)
results.add_column("Ambiguous Word", words_to_analyze)
results.add_column("Choosen Synset", choosen_synsets)
results.add_column("New Sentence", new_sentences)

#write the table to a file
table_txt = results.get_string()
with open('./output/Output Word Disambiguation.txt', 'w') as file:
    file.write(table_txt)

#----------------------------------------------------- SEMCOR TESTING -------------------------------------------#

#getting already semantically tagged sentences from semcor corpus
sem_tagged_sentences = sc.tagged_sents(tag='sem')[1:50]
#getting the same sentences untagged
semcor_sentences = sc.sents()[1:50]
#getting already pos tagged sentences from semcor corpus
pos_tagged_sentences = sc.tagged_sents(tag='pos')[1:50]
semtest_results = ts.semcorDisambiguation(sem_tagged_sentences,
                                          semcor_sentences,
                                          pos_tagged_sentences)
semtest_sample = semtest_results[0][1:10]
table_txt_semcor = semtest_results[0].get_string()
with open('./output/Output Semcor Testing.txt', 'w') as file:
    file.write(table_txt_semcor + "\r\n")
    file.write("Accuracy: " + str(semtest_results[1]) + "%")
sample_text = semtest_sample[0].get_string()

# Over 50 runs the average accuracy was 41.32%
Example #22
## Extract sentences for different senses of lemmas specified below from SemCor
targets = (("capital", "n"), ("interest", "n"), ("motion", "n"), ("plant", "n"), ("space", "n"), ("suit", "n"), ("tank", "n"), ("vessel", "n")) # natural ambiguous nouns from Schuetze (1998)
## "ruling" (2nd sense = verb gerund) and "train" (noun vs. verbs) have been excluded
targets += (("bank", "n"), ("hand", "n"), ("room", "n")) # some other interesting ambiguous nouns
targets += (("find", "v"), ("grasp", "v"), ("open", "v"), ("run", "v")) # try some verbs

import sys
import nltk
from nltk.corpus import semcor, wordnet
from nltk.stem.wordnet import WordNetLemmatizer
wnl = WordNetLemmatizer()

files = [id for id in semcor.fileids() if not id.startswith("brownv")]
# files = "brown1/tagfiles/br-j05.xml" # for testing

sents_sem = semcor.tagged_sents(fileids=files, tag="sem")
sents_pos = semcor.tagged_sents(fileids=files, tag="pos")
# sents_sem = sents_sem[10:13] # for testing
# sents_pos = sents_pos[10:13]

pos2wn = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

print("id\tsid\ttarget\tpos\tsense\tgloss\tsentence\thw\tlemma")
s_num = 0
item_num = {t: 0 for t in targets}
for s_sem, s_pos in zip(sents_sem, sents_pos):
    s_num += 1
    
    lemmas = [x.label() for x in s_sem if isinstance(x, nltk.tree.Tree)] # annotated WordNet lemmas (= synset.hw)
    lemmas = [x for x in lemmas if isinstance(x, nltk.corpus.reader.wordnet.Lemma)] # skip entries where sense isn't a Lemma object
Example #23
  try:
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
  except ImportError:
    comm = None

  if feature:

    if feature == 'ngram':
      featurevocab = ngram_vocab(n)
      with open(corpusfile, 'r') as f:
        matrix, featurecounts, wordcounts = cooc_matrix((line.split() for line in f), featurevocab, vocab, n=n, unk=UNK, verbose=True, comm=comm)
    elif feature == 'synset':
      featurevocab = synset_vocab()
      matrix, featurecounts, wordcounts = cooc_matrix(semcor.sents(), featurevocab, vocab, doc2wnd=synset_context(iter(semcor.tagged_sents(tag='sem'))), unk=UNK, interval=100, verbose=True, wndo2=None)
      featurevocab = [synset.name() for synset in featurevocab]
    else:
      raise(NotImplementedError)

    if comm is None or not comm.rank:
      sp.save_npz(outputroot+'.npz', matrix)
      with open(outputroot+'.pkl', 'wb') as f:
        pickle.dump({'words': vocab, feature+'s': featurevocab, 'wordcounts': wordcounts, feature+'counts': featurecounts}, f)
    else:
      sys.exit()

  else:

    with open(corpusfile, 'r') as f:
      matrix, counts = symmetric_cooc_matrix((line.split() for line in f), vocab, unk=UNK, verbose=True, comm=comm)
			l_hypos = []
			for hypo_synset in synset.hyponyms():
				word = hypo_synset.name().split('.')[0]
				if word not in l_hypos:
					l_hypos.append(word)
			if l_hypos:
				random_index = random.randint(0, len(l_hypos) - 1)
				new_sentence.append(l_hypos[random_index])
			else:
				for w in word:
					new_sentence.append(w)
	print (' '.join(new_sentence))

if __name__ == "__main__":

	args = parse_command_line()

	l_sentence = semcor.sents()[args.index]
	sentence = ' '.join(l_sentence)

	print (sentence)

	s = semcor.tagged_sents(tag='sem')[args.index]
	# random.seed(a=0)

	if args.nym == 'synonym':
		print_synonym_sentence(s)
	elif args.nym == 'hypernym':
		print_hypernym_sentence(s)
	elif args.nym == 'hyponym':
		print_hyponym_sentence(s)
def process_semcor(ref_dict):
    '''
    Return a DataFrame that contains sentences along with citations and information about detected heteronyms
    '''
    sents = semcor.sents()
    tagged_sents = semcor.tagged_sents(tag='sem')

    sense_list = list(ref_dict['sense'])
    semcor_sents = pd.DataFrame(columns=['sentence', 'citation', 'heteronym'])
    word_duplicate_sense = set(ref_dict[ref_dict.duplicated(['sense'
                                                             ])]['word'])

    for sent_idx, sent in enumerate(tagged_sents):
        het_in_sent = []
        for token_idx, token in enumerate(sent):

            if type(token) == nltk.Tree:
                lemma = token.label()
                chunk = token.leaves()

                ## Check whether token is a heteronym
                if (type(lemma) == nltk.corpus.reader.wordnet.Lemma) and (
                        lemma.synset() in sense_list) and (len(chunk) == 1):

                    synset = lemma.synset()
                    word = chunk[0]

                    ## Take care of sense-duplicated heteronyms (rare),
                    ## e.g. project and projects can have same sense but different pronunciations.
                    if word.lower() in word_duplicate_sense:
                        pron = list(ref_dict[(ref_dict['word'] == word.lower())
                                             & (ref_dict['sense'] == synset)]
                                    ['pronunciation'])
                        if pron:
                            het_in_sent.append((word.lower(), synset, pron[0]))

                    ## If the sense is not duplicated, the mapping to pron is one-to-one
                    else:
                        pron = list(ref_dict[ref_dict['sense'] == synset]
                                    ['pronunciation'])[0]
                        word_in_ref = list(
                            ref_dict[ref_dict['sense'] == synset]['word'])[0]
                        if word.lower() == word_in_ref:
                            het_in_sent.append((word_in_ref, synset, pron))

        if het_in_sent:
            new_row = {
                'sentence':
                "".join([
                    " " + i if not i.startswith("'")
                    and i not in string.punctuation else i
                    for i in sents[sent_idx]
                ]).strip(),
                'citation':
                'SemCor',
                'heteronym':
                het_in_sent
            }
            semcor_sents = semcor_sents.append(new_row, ignore_index=True)

    return semcor_sents
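Judging from the column accesses above, ref_dict is a pandas DataFrame with 'word', 'sense' (WordNet Synset objects) and 'pronunciation' columns; a hypothetical call could look like this (the two-row table and the ARPAbet strings are purely illustrative):

import pandas as pd
from nltk.corpus import wordnet as wn

ref_dict = pd.DataFrame({
    'word': ['bass', 'bass'],
    'sense': [wn.synsets('bass', 'n')[0], wn.synsets('bass', 'n')[1]],
    'pronunciation': ['B AE1 S', 'B EY1 S'],
})
semcor_sents = process_semcor(ref_dict)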
        return 'Noun'
    elif 'VB' in posToConvert:
        return 'Verb'
    elif posToConvert == 'RB':
        return 'Adverb'
    elif posToConvert == 'JJ':
        return 'Adjective'


semcorFileIds = semcor.fileids()

count = 0
wordSynsetCount = defaultdict(int)
wordCount = defaultdict(int)
for fileID in semcorFileIds:
    for tagged_sent in semcor.tagged_sents(fileID, 'both'):
        for tree in tagged_sent:
            if type(tree.label()) is Lemma:
                synset = tree.label().synset()
                for wordTuple in tree.pos():
                    wordPoS = convertPoS(wordTuple[1])
                    wordLowercase = wordTuple[0].lower()
                    wordCount[wordLowercase + " " + wordPoS] += 1
                    wordSynsets = wn.synsets(wordTuple[0])
                    if synset in wordSynsets:
                        wordSynsetCount[wordLowercase + " " + str(synset)] += 1

with open('semcorWordFreqCount', 'wb') as f:
    pickle.dump(wordCount, f)

with open('semcorWordSenseCount', 'wb') as f:
Example #27
def main2(filename):
    # csv fieldnames
    fieldnames = [
        'word', 'lemma', 'pos', 'is_homonym', 'wn_synset', 'ws_meaning',
        'confidence', 'nsenses'
    ]

    print('Reading {}...'.format(filename))
    outfile = open(os.path.join(OUTDIR, filename + '.tsv'), 'w')
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\t')
    writer.writeheader()

    for i, sent in enumerate(semcor.tagged_sents(tag='both')):

        if i % 100 == 0:
            print('{} sentences read'.format(i))

        for t in sent:
            if type(t) == str:
                # no lemma
                continue
            word = t.flatten()[0].lower()
            pos = t.pos()[0][1]
            if pos != 'NN' and not pos.startswith('V'):
                # t is not a noun or verb
                continue
            lemma = t.label().name()
            wn_synset = t.label().synset()
            nsenses = len(wn.synsets(word))

            row = {}
            def_bag = set(stem(w.lower()) for w in wn_synset.definition().split())
            urls = homonym_urls(lemma)
            if urls == []:
                row['is_homonym'] = False
                #row['ws_meaning'] = 1
                #row['confidence'] = 1
            else:

                if pos.startswith('N'):
                    pos_ = 'noun'
                elif pos.startswith('V'):
                    pos_ = 'verb'

                meaning_bags = [meaning_bag(pos_, url) for url in urls]
                confidence = []  # list of overlap size between bags
                for mb in meaning_bags:
                    confidence.append(len(mb & def_bag))

                row['is_homonym'] = True
                row['ws_meaning'] = confidence.index(max(confidence))
                row['confidence'] = '|'.join(str(x) for x in confidence)

            row['word'] = word
            row['lemma'] = lemma
            row['pos'] = pos
            row['wn_synset'] = wn_synset
            row['nsenses'] = nsenses

            writer.writerow(row)

    outfile.close()
		if 'NN' in posToConvert:
			return 'Noun'
		elif 'VB' in posToConvert:
			return 'Verb'
		elif posToConvert == 'RB':
			return 'Adverb'
		elif posToConvert == 'JJ':
			return 'Adjective'	

semcorFileIds = semcor.fileids()

count = 0
wordSynsetCount = defaultdict(int)
wordCount = defaultdict(int)
for fileID in semcorFileIds:
	for tagged_sent in semcor.tagged_sents(fileID, 'both'):
		for tree in tagged_sent:
			if type(tree.label()) is Lemma:
				synset = tree.label().synset()
				for wordTuple in tree.pos():
					wordPoS = convertPoS(wordTuple[1]) 
					wordLowercase = wordTuple[0].lower()
					wordCount[wordLowercase + " " + wordPoS] += 1
					wordSynsets = wn.synsets(wordTuple[0])
					if synset in wordSynsets:
						wordSynsetCount[wordLowercase + " " + str(synset)] += 1

with open('semcorWordFreqCount', 'wb') as f:
	pickle.dump(wordCount, f)

with open('semcorWordSenseCount', 'wb') as f:
from nltk.corpus import wordnet as wn
from nltk.corpus import semcor

wn_lemmas = set()
for lemma in wn.all_lemma_names(pos=wn.ADJ):
  wn_lemmas.add(lemma)

wn_adj_synsets = collections.defaultdict(set)

for word in wn_lemmas:
  for synset in wn.synsets(word, wn.ADJ):
    wn_adj_synsets[synset.name.lower()] = [lemma.lower() for lemma in synset.lemma_names ]

semcor_adjectives = set()
i = 0
for sent in semcor.tagged_sents(tag='both'):
  for c,chk in enumerate(sent):
    if chk.node and len(chk.node)>3 and chk.node[-3]=='.' and chk.node[-2:].isdigit() and chk[0].node.startswith('JJ'):
      if len(chk.leaves()) == 1:
        semcor_adjectives.add(chk.leaves()[0].lower())


semcor_synsets = set()
for s, words in wn_adj_synsets.items():
  for w in words:
    if w in semcor_adjectives:
      semcor_synsets.add(s.lower())

vectors = set()
vector_adj_file = open("data/VSM/eacl14-faruqui-en-svd-de-64.adj", "w")
for line in open("data/VSM/eacl14-faruqui-en-svd-de-64.adj.txt"):
Example #30
        elif ematrix[i][j] == 'SUB':
            seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j - 1]))
            seq.append('SUB')
            i -= 1
            j -= 1
        elif ematrix[i][j] == 'INS':
            seq.append(str(cmatrix[i][j] - cmatrix[i][j - 1]))
            seq.append('INS')
            j -= 1
        elif ematrix[i][j] == 'DEL':
            seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j]))
            seq.append('DEL')
            i -= 1
    seq = ' '.join(reversed(seq))
    print(seq)


if __name__ == '__main__':
    #...TODO Parse arguments and load semcor sentences
    args = parse_command_line()

    l_sentence1 = semcor.sents()[args.untagged]
    l_sentence2 = semcor.sents()[args.tagged]

    # TODO print sentence1 and sentence2
    print(' '.join(l_sentence1))
    print(' '.join(l_sentence2))

    s2 = semcor.tagged_sents(tag='sem')[args.tagged]

    wordnet_edit_distance(l_sentence1, s2, args.sim)
Example #31
        line = line.strip()
        if line != '':
            sent.append(line.split('\t'))
        else:
            if len(sent) > 0:
                supsense_sentences.append(sent)
                sent = []
        c += 1

print(supsense_sentences)
print(c)

# print(semcor.words())
# print(semcor.chunks())
i = 0
semcor_sents = semcor.tagged_sents(tag='both')
# print(semcor_sents)
with open('semcor_all.conll', 'w') as f:
    for sent in semcor_sents:
        if i == len(supsense_sentences):
            break
        ref_sent = supsense_sentences[i]
        for j, ch in enumerate(sent):
            # print(ch)
            # f.write(str(ch) + '\n')
            rt = ch.label()
            sense = 'O'
            if not isinstance(ch[0], str):
                sense = rt
                if isinstance(ch[0][0], str):
                    pos = ch[0].label()
from nltk.corpus import conll2000, conll2002
print(conll2000.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE


# SEMCOR
    
from nltk.corpus import semcor
print(semcor.words())
print(semcor.chunks())
print(semcor.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
list(map(str, semcor.tagged_chunks(tag='both')[:3]))
[[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]    

# IEER

from nltk.corpus import ieer
ieer.fileids() # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
print(docs[0].docno)
print(docs[0].doctype)
print(docs[0].date_time)
print(docs[0].headline)
print(docs[0].text) # doctest: +ELLIPSIS
Example #33
import nltk
from nltk.tree import Tree
from nltk.corpus.reader.wordnet import Lemma
from nltk.corpus import semcor
from nltk.corpus import wordnet

noun = set(['NN', 'NNS', 'NNP', 'NNPS'])
verb = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
adjective = set(['JJ', 'JJR', 'JJS'])
adverb = set(['RB', 'RBR', 'RBS'])
substantive = noun | verb | adjective | adverb

corp = semcor.sents()

tags = semcor.tagged_sents(tag = 'sem')

n = 0

correct = 0
base = 0
total = 0

for sent in corp:

    sentence = ' '.join(sent)

    print(sentence)

    parsed = list(parser.parse(tokenizer.tokenize(sentence)))
        word = str(sentence_chunk[0])
        descriptor = str(sentence_chunk.label())

    if word in lemmas:
        lemmas[word][descriptor] = lemmas[word][descriptor] + 1 if descriptor in lemmas[word] else 1
    else:
        lemmas[word] = {descriptor: 1}  # this else statement prevents keyerror lookups on lemmas[word][synset]
    return word

print("Importing Lemma and Synsets")
lemmas = dict()
    # lemmas is a dict of dict,
    # lemmas[word] = dictionary of { synsets:frequency of synset when associated with a 'word' }
    # lemmas[word][synset] is a count of how many times a synset appears for each word
    # *** len(lemmas[word]) = the number of different senses a 'word' has in the corpus
taggedsentences = semcor.tagged_sents(tag='both')
    # all sentences, fully tagged from SEMCOR
plaintextsentences = semcor.sents()
    # all sentences from SEMCOR
targetsentences = {}
    # sentences containing 'point'
pos = dict()
    # list of part of speech tags from the corpus
max_sentence_len = 0
lemmacount = {}


# find all sentences including exactly 1 occurence of 'back'
# not all of these sentences are related to the synsets we are looking for
# e.g. goes back relates to the verb go instead of back
for i, s in enumerate(plaintextsentences) :
            seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j - 1]))
            seq.append('SUB')
            i -= 1
            j -= 1
        elif ematrix[i][j] == 'INS':
            seq.append(str(cmatrix[i][j] - cmatrix[i][j - 1]))
            seq.append('INS')
            j -= 1
        elif ematrix[i][j] == 'DEL':
            seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j]))
            seq.append('DEL')
            i -= 1
    seq = ' '.join(reversed(seq))
    print(seq)


if __name__ == '__main__':
    #...TODO Parse arguments and load semcor sentences
    args = parse_command_line()

    l_sentence1 = semcor.sents()[args.index1]
    l_sentence2 = semcor.sents()[args.index2]

    # TODO print sentence1 and sentence2
    print(' '.join(l_sentence1))
    print(' '.join(l_sentence2))

    s1 = semcor.tagged_sents(tag='sem')[args.index1]
    s2 = semcor.tagged_sents(tag='sem')[args.index2]

    wordnet_edit_distance(s1, s2, args.sim)