def main():

    if cf.EMBEDDING_MODEL == "Elmo":
        raise Exception("Please use build_data_elmo instead.")
    #if cf.MODEL_TYPE == S2S:
    corpusReader = ConllCorpusReader(cf.DATA_FOLDER,
                                     [cf.TRAIN_FILENAME, cf.TEST_FILENAME],
                                     ['words', 'pos'])
    #elif cf.MODEL_TYPE == S21:
    #	corpusReader = TabbedCorpusReader(cf.DATA_FOLDER, [cf.TRAIN_FILENAME, cf.TEST_FILENAME])

    tagged_sents = corpusReader.tagged_sents()

    test_unique_wordtags, test_unique_chartags = get_unique_test_tag_set()

    logger.info("%d sentences loaded." % len(tagged_sents))
    #tagged_sents = clean_sentences(tagged_sents)
    #logger.info("%d sentences after cleaning (removing short/long sentences)." % len(tagged_sents))

    word_to_ix, ix_to_word, wtag_to_ix, ix_to_wtag = get_word_and_wordtag_ids(
        tagged_sents)  #, test_unique_wordtags)
    char_to_ix, ix_to_char, ctag_to_ix, ix_to_ctag = get_char_and_chartag_ids(
        tagged_sents)  #, test_unique_chartags)

    save_data_to_files(tagged_sents, word_to_ix, wtag_to_ix, ix_to_word,
                       ix_to_wtag, char_to_ix, ctag_to_ix, ix_to_char,
                       ix_to_ctag)

    if cf.USE_PRETRAINED_WORD_EMBEDDINGS:
        # Get all words in the embedding vocab
        emb_vocab = get_emb_vocab(cf.EMB_VEC_FILENAME)

        # Generate OOV embeddings for any words in ix_to_word that aren't in emb_vocab
        #generate_oov_embeddings(ix_to_word, emb_vocab, cf.EMB_BIN_FILENAME, cf.OOV_TOKENS_FILENAME, cf.EMB_OOV_FILENAME)

        # Combine OOV embeddings with IV embeddings and export them to a file
        export_trimmed_embedding_vectors(word_to_ix, cf.WORD_EMBEDDING_DIM,
                                         cf.EMB_OOV_FILENAME,
                                         cf.EMB_VEC_FILENAME,
                                         cf.EMB_TRIMMED_FILENAME,
                                         cf.WORD_EMBEDDING_DIM)

    if cf.USE_PRETRAINED_CHAR_EMBEDDINGS:
        char_emb_vocab = get_emb_vocab(cf.CHAR_EMB_VEC_FILENAME)
        generate_oov_embeddings(ix_to_char, char_emb_vocab,
                                cf.CHAR_EMB_BIN_FILENAME,
                                cf.CHAR_OOV_TOKENS_FILENAME,
                                cf.CHAR_EMB_OOV_FILENAME)
        export_trimmed_embedding_vectors(char_to_ix, cf.CHAR_EMBEDDING_DIM,
                                         cf.CHAR_EMB_OOV_FILENAME,
                                         cf.CHAR_EMB_VEC_FILENAME,
                                         cf.CHAR_EMB_TRIMMED_FILENAME,
                                         cf.CHAR_EMBEDDING_DIM)

    logger.info("Data building complete.")
Example #2
    def read_turkish_corpus(self):
        conll_reader = ConllCorpusReader('path/to/languages-corpora',
                                         'path/to/turkish-pos-conll-file',
                                         ('words', 'pos'),
                                         encoding='UTF-8')
        tagged_sentences_raw = list(
            conll_reader.tagged_sents('path/to/turkish-pos-conll-file'))
        # Lower-case every word, keeping its POS tag.
        tagged_sentences = [[(w.lower(), t) for (w, t) in s]
                            for s in tagged_sentences_raw]
        return tagged_sentences
def load_data_vlsp2018(folder='../data/vlsp2018'):
    train_sents = ConllCorpusReader(
        folder, 'train.conll',
        ['words', 'pos', 'ignore', 'chunk']).iob_sents()
    test_sents = ConllCorpusReader(
        folder, 'test.conll', ['words', 'pos', 'ignore', 'chunk']).iob_sents()

    train_sents = [x for x in train_sents if len(x) > 0]
    test_sents = [x for x in test_sents if len(x) > 0]

    print("#train_sents", len(train_sents))
    print("#test_sents", len(test_sents))
    return train_sents, test_sents
Example #4
def load_data_conll2003():
    train_sents = ConllCorpusReader(
        '../data/conll2003', 'train.txt',
        ['words', 'pos', 'ignore', 'chunk']).iob_sents()
    test_sents = ConllCorpusReader(
        '../data/conll2003', 'valid.txt',
        ['words', 'pos', 'ignore', 'chunk']).iob_sents()

    train_sents = [x for x in train_sents if len(x) > 0]
    test_sents = [x for x in test_sents if len(x) > 0]

    print("#train_sents", len(train_sents))
    print("#test_sents", len(test_sents))
    return train_sents, test_sents
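For orientation, each sentence returned by `iob_sents()` is a list of `(word, pos, iob_tag)` triples; with the `['words', 'pos', 'ignore', 'chunk']` layout above, the IOB slot carries the fourth column of the CoNLL-2003 files, i.e. the NER tags. A small illustrative check (the sentence index and slice are arbitrary):

train_sents, test_sents = load_data_conll2003()
for word, pos, ne_tag in train_sents[1][:5]:   # first few tokens of one sentence
    print(word, pos, ne_tag)                   # e.g. "EU NNP B-ORG"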
Example #5
    def _get_corpus(self, file_path: str) -> ConllCorpusReader:
        path = PurePath(file_path)
        return ConllCorpusReader(
            root=str(path.parents[0]),
            fileids=str(path.name),
            columntypes=["words", "pos", "ignore", "chunk"],
        )
Example #6
    def srl_instances(self,
                      fileids=None,
                      categories=None,
                      pos_in_tree=None,
                      flatten=True):
        return ConllCorpusReader.srl_instances(
            self, self._resolve(fileids, categories), pos_in_tree, flatten)
def main():

    corpusReader = ConllCorpusReader(cf.DATA_FOLDER,
                                     [cf.TRAIN_FILENAME, cf.TEST_FILENAME],
                                     ['words', 'pos'])

    tagged_sents = corpusReader.tagged_sents()

    test_unique_wordtags, test_unique_chartags = get_unique_test_tag_set()

    logger.info("%d sentences loaded." % len(tagged_sents))

    (ix_to_word, word_to_ix, wtag_to_ix, ix_to_wtag,
     embedding_vectors) = get_ids_and_elmo_embeddings(tagged_sents)

    char_to_ix, ix_to_char, ctag_to_ix, ix_to_ctag = get_char_and_chartag_ids(
        tagged_sents)

    save_data_to_files(tagged_sents, word_to_ix, wtag_to_ix, ix_to_word,
                       ix_to_wtag, char_to_ix, ctag_to_ix, ix_to_char,
                       ix_to_ctag)

    logger.info("Data building complete.")
Example #8
def load_datasets(word_to_ix, wtag_to_ix, char_to_ix, ctag_to_ix, ix_to_char,
                  ix_to_word):
    data_iterators = {"train": None, "dev": None}
    test_dataset = []
    word_index = 1  # Used for elmo models only
    for i, dataset in enumerate(["train", "test"]):

        #if cf.MODEL_TYPE == S2S:
        corpusReader = ConllCorpusReader(
            cf.DATA_FOLDER, [[cf.TRAIN_FILENAME, cf.TEST_FILENAME][i]],
            ['words', 'pos'])
        #elif cf.MODEL_TYPE == S21:
        #    corpusReader = TabbedCorpusReader(cf.DATA_FOLDER, [[cf.TRAIN_FILENAME, cf.TEST_FILENAME][i]])

        tagged_sents = corpusReader.tagged_sents()
        (data_w, data_x, data_y, data_f, rejected_sents, rejected_words,
         filtered_words, non_alphabetical_words, rejected_tags,
         word_index) = tagged_sents_to_numpy(tagged_sents, word_to_ix,
                                             wtag_to_ix, char_to_ix,
                                             ctag_to_ix, ix_to_char,
                                             ix_to_word, dataset, word_index)
        if cf.WORD_LEVEL_WITH_FLAGGER:
            myDataset = MyDatasetWithFlags(data_w, data_x, data_y, data_f)
        else:
            myDataset = MyDataset(data_w, data_x, data_y)

        data_iterator = DataLoader(myDataset, batch_size=cf.BATCH_SIZE,
                                   pin_memory=True)
        data_iterators[dataset] = data_iterator
        #for d in data_iterator:
        #    torch.set_printoptions(threshold=5000000)
        #    print(d)
        #    exit()
        logger.info(
            "Loaded %d %s batches.\n" % (len(data_iterator), dataset) +
            "      (%d x %d = ~%d %s total)" %
            (len(data_iterator), cf.BATCH_SIZE,
             len(data_iterator) * cf.BATCH_SIZE,
             "words" if cf.GRANULARITY in [CHAR_LEVEL, CHAR_AND_WORD_LEVEL]
             else "sentences"))
        if len(rejected_sents) > 0:
            logger.warning(
                "%d of %d sentences from the %s set were trimmed due to being too long or short."
                % (len(rejected_sents),
                   len(tagged_sents) + len(rejected_sents), dataset))
        if len(rejected_words) > 0:
            logger.warning(
                "%d words from the %s set were trimmed due to being too long."
                % (len(rejected_words), dataset))
        if len(rejected_tags) > 0:
            logger.warning(
                "%d labels from the %s set were trimmed due to being too long."
                % (len(rejected_tags), dataset))
        if len(filtered_words) > 0:
            logger.info(
                "%d words were filtered from the %s set due to beginning with undesirable character sequences."
                % (len(filtered_words), dataset))
        if len(non_alphabetical_words) > 0:
            logger.info(
                "%d words were filtered from the %s set due to being entirely non-alphabetical."
                % (len(non_alphabetical_words), dataset))
    return data_iterators
Example #9
def main() -> None:
    """Точка входа в приложение."""
    corpus_root = Path('corpus')
    # Set up logging of the results
    global _logger
    setup_logger(_logger, corpus_root / 'collocations.log')

    # Load the stop words
    nltk.download('stopwords', '.env/share/nltk_data')
    stop_words = set(stopwords.words('russian'))

    # Import the corpus
    tags_root = corpus_root / 'pos_tagging'
    reader = ConllCorpusReader(
        str(tags_root), [f.name for f in tags_root.glob('*.tags')],
        columntypes=['words', 'ignore', 'ignore', 'ignore', 'pos'],
        separator='\t')
    _logger.info('Documents: %d', len(reader.fileids()))
    _logger.info('Tokens in the first document (%s): %d',
                 reader.fileids()[0], len(reader.words(reader.fileids()[0])))

    _logger.info('Loading sentences')
    sentences = reader.sents()

    # Build contingency tables for all words in the corpus
    _logger.info('Computing the contingency table over all words')
    bigram_finder = BigramCollocationFinder.from_documents(
        [w.lower() for w in sent] for sent in tqdm(sentences))
    _logger.info('Total bigrams: %d', bigram_finder.N)

    print_samples(bigram_finder)

    # Now filter by frequency and drop punctuation and stop words
    _logger.info(
        'Filtering punctuation and stop words and applying a frequency threshold')
    bigram_finder.apply_freq_filter(5)
    bigram_finder.apply_word_filter(lambda w: len(w) < 3 or w in stop_words)
    _logger.info('Total bigrams: %d', bigram_finder.N)
    print_samples(bigram_finder)
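`print_samples` is not defined anywhere in this listing; a minimal sketch of what it might do, scoring the finder's bigrams with NLTK's built-in association measures (the likelihood-ratio measure and the cutoff of 20 are assumptions, not taken from the original script):

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder


def print_samples(finder: BigramCollocationFinder, n: int = 20) -> None:
    """Log the top-n bigrams by likelihood ratio (sketch; measure and cutoff assumed)."""
    measures = BigramAssocMeasures()
    for w1, w2 in finder.nbest(measures.likelihood_ratio, n):
        _logger.info('%s %s', w1, w2)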
def get_unique_test_tag_set():

    logger.info("Building set of testset-unique tags...")

    corpusReaderTrain = ConllCorpusReader(cf.DATA_FOLDER, [cf.TRAIN_FILENAME],
                                          ['words', 'pos'])
    corpusReaderTest = ConllCorpusReader(cf.DATA_FOLDER, [cf.TEST_FILENAME],
                                         ['words', 'pos'])

    tagged_sents_train = corpusReaderTrain.tagged_sents()
    tagged_sents_test = corpusReaderTest.tagged_sents()

    train_wordtags = set()
    train_chartags = set()
    for sent in tagged_sents_train:
        for word, tag in sent:
            if tag != "<PAD>" and tag != "<SELF>":
                train_wordtags.add(tag)
                for char in tag:
                    train_chartags.add(char)

    test_unique_wordtags = set()
    test_unique_chartags = set()
    for sent in tagged_sents_test:
        for word, tag in sent:
            if tag != "<PAD>" and tag != "<SELF>":
                if tag not in train_wordtags:
                    test_unique_wordtags.add(tag)
                    for char in tag:
                        if char not in train_chartags:
                            test_unique_chartags.add(char)

    logger.info(
        "%d unique word tags and %d unique char tags found in the test dataset."
        % (len(test_unique_wordtags), len(test_unique_chartags)))
    return test_unique_wordtags, test_unique_chartags
Example #11
    def parsed_sents(self, fileids=None, categories=None, pos_in_tree=None):
        return ConllCorpusReader.parsed_sents(
            self, self._resolve(fileids, categories), pos_in_tree)
Example #12
    def chunked_sents(self, fileids=None, categories=None, chunk_types=None):
        return ConllCorpusReader.chunked_sents(
            self, self._resolve(fileids, categories), chunk_types)
    def tagged_sents(self, fileids=None, categories=None):
        return ConllCorpusReader.tagged_sents(
            self, self._resolve(fileids, categories))
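These one-line wrappers, like the similar fragments elsewhere in this listing, are methods of a class that combines `CategorizedCorpusReader` with `ConllCorpusReader`. A minimal sketch of how such a reader is usually assembled (the constructor split mirrors NLTK's own categorized readers; the root path and `cat_pattern` below are purely illustrative):

from nltk.corpus.reader import ConllCorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader


class CategorizedConllCorpusReader(CategorizedCorpusReader, ConllCorpusReader):
    def __init__(self, *args, **kwargs):
        # CategorizedCorpusReader pops its own keyword arguments
        # (cat_pattern / cat_map / cat_file) out of kwargs.
        CategorizedCorpusReader.__init__(self, kwargs)
        ConllCorpusReader.__init__(self, *args, **kwargs)

    def tagged_sents(self, fileids=None, categories=None):
        # _resolve() (from CategorizedCorpusReader) maps categories to fileids.
        return ConllCorpusReader.tagged_sents(
            self, self._resolve(fileids, categories))


# Illustrative instantiation: the category is taken from the file name prefix.
reader = CategorizedConllCorpusReader('corpus_root', r'.*\.conll',
                                      columntypes=['words', 'pos'],
                                      cat_pattern=r'(\w+)_.*\.conll')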
# Split each raw training sentence into tokens, then each "word|tag" token
# into its [word, tag] pair (list_train is read from file earlier in the
# script, outside this excerpt).
lst = [sent.split() for sent in list_train]

lst_train = [[token.split('|') for token in sent] for sent in lst]

print('Train Set Read')

## Reads the WikiGold Test Set

reader = ConllCorpusReader('/home/nicor/Documents','.conll',('words','pos'))
list_test = reader.tagged_sents('wikigold.txt')

# For each test sentence, re-tag the words with nltk.pos_tag and keep the
# gold label from the corpus: [word, predicted_pos, gold_label].
lst_test = []
for sent in list_test:
    words = [word for (word, _) in sent]
    pos_tagged = nltk.pos_tag(words)  # requires `import nltk` earlier in the script
    lst_test.append([[word, pos, gold]
                     for (word, pos), (_, gold) in zip(pos_tagged, sent)])

### Defines the Features to be obtained from every sentence
Example #17
    def tagged_words(self, fileids=None, categories=None):
        return ConllCorpusReader.tagged_words(
            self, self._resolve(fileids, categories))
    def srl_spans(self, fileids=None, categories=None):
        return ConllCorpusReader.srl_spans(
            self, self._resolve(fileids, categories))
Example #19
from __future__ import division
from nltk.corpus.reader import ConllCorpusReader
from nltk.probability import FreqDist, DictionaryProbDist, LaplaceProbDist, SimpleGoodTuringProbDist, MLEProbDist

conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))  # getting a train corpus from file
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')  # list of 12 POS tags
sentslen = len(conllreader.tagged_sents())  # getting number of sentences

tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())   # frequency of each POS tag

firsttagfdist = FreqDist(pair[0][1] for pair in conllreader.tagged_sents())  # frequency of each sentence-initial tag
A0j = DictionaryProbDist({k: v / sentslen for k, v in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

TagPair = []
words = conllreader.tagged_words()
for i in range(0, len(words)-1):
    TagPair.append((words[i][1], words[i+1][1]))

TagPairfdist = FreqDist(TagPair)
Aij = DictionaryProbDist({k: v / tagfdist.get(k[0]) for k, v in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

TagWordfdist = FreqDist(conllreader.tagged_words())
Biw = DictionaryProbDist({k: v / tagfdist.get(k[1]) for k, v in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)
Example #20
a = {}


## Function to add an adjective to a noun key
def add_adj(noun_param, adj_param):
    if (noun_param in a):
        a[noun_param].append(adj_param)
    else:
        a[noun_param] = [adj_param]


filedir = '/Users/fnascime/Documents/Sicily_Project/texts/'
filename = 'ilgattopardo_prima'

mycorpus = ConllCorpusReader(filedir, filename + '.conll',
                             ('ignore', 'words', 'ignore', 'pos', 'ignore',
                              'ignore', 'ignore', 'ignore'))

words = mycorpus.tagged_words()
list_len = len(words)

## Loop through the file and retrieve adjectives directly associated with nouns (adjunct words)
for i in range(list_len):

    if (words[i][1] == 'S'):
        if ((i > 0) and (words[i - 1][1] == 'A')):
            add_adj(words[i][0], words[i - 1][0])
        elif ((i < list_len - 1) and (words[i + 1][1] == 'A')):
            add_adj(words[i][0], words[i + 1][0])

## Loop through the list of nouns and find the ones with the most adjectives
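The listing breaks off here; a plausible continuation under that comment, ranking the nouns in `a` by how many adjectives were collected for them (the cutoff of 10 is an arbitrary choice):

# Sort nouns by the number of adjectives gathered for each and show the top ones.
ranked = sorted(a.items(), key=lambda item: len(item[1]), reverse=True)
for noun, adjectives in ranked[:10]:
    print(noun, len(adjectives), adjectives)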
Example #21
    def iob_words(self, fileids=None, categories=None, pos_in_tree=None):
        return ConllCorpusReader.iob_words(self,
                                           self._resolve(fileids, categories))
Example #22
from __future__ import division
from nltk.corpus.reader import ConllCorpusReader
import Train

conllreader = ConllCorpusReader(".", "de-test.t", ('words', 'pos'))
states = Train.states

def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]
    path = {}
    # Initialize base cases (t == 0)
    for y in states:
        if sum(emit_p.prob((obs[0], y1)) for y1 in states) != 0:
            V[0][y] = start_p.logprob(y) + emit_p.logprob((obs[0], y))
        else:
            V[0][y] = start_p.logprob(y)
        path[y] = [y]

    # Run Viterbi for t > 0
    for t in range(1, len(obs)):
        V.append({})
        newpath = {}

        for y in states:
            if sum(emit_p.prob((obs[t], y1)) for y1 in states) != 0:
                (prob, state) = max((V[t-1][y0] + trans_p.logprob((y0, y)) + emit_p.logprob((obs[t], y)), y0) for y0 in states)
            else:
                (prob, state) = max((V[t-1][y0] + trans_p.logprob((y0, y)), y0) for y0 in states)
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        # Don't need to remember the old paths
        path = newpath

    # Termination: pick the most probable final state and return its path.
    (prob, state) = max((V[len(obs) - 1][y], y) for y in states)
    return (prob, path[state])


# The lines below are a fragment of a ConllCorpusReader subclass spliced into
# this listing: the tail of its iob_words() method (a nested helper and its
# return) together with the _get_iob_words() helper it calls.
        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyMap(get_iob_words, self._grids(fileids))

    def _get_iob_words(self, grid, tagset=None):
        pos_tags = self._get_column(grid, self._colmap['pos'])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(
            zip(self._get_column(grid, self._colmap['words']), pos_tags,
                self._get_column(grid, self._colmap['ne'])))


bject = ConllCorpusReader("/home/subham", 'train_ner.txt',
                          ('words', 'pos', 'chunk'), ('NP_B', 'PP', 'VP'))
train_sents = bject.iob_sents('train_ner.txt')
bject1 = ConllCorpusReader("/home/subham", 'test_accuracy.txt',
                           ('words', 'pos', 'chunk'), ('NP_B', 'PP', 'VP'))
#train_sents=bject.iob_sents('conll.txt')

test_sents = bject1.iob_sents('test_accuracy.txt')
#train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
#test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
#print(test_sents[0])
#print(train_sents[0])


def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
Example #25
    def sents(self, fileids=None, categories=None):
        return ConllCorpusReader.sents(self,
                                       self._resolve(fileids, categories))
    def words(self, fileids=None, categories=None):
        return ConllCorpusReader.words(self,
                                       self._resolve(fileids, categories))
Example #28
# Copyright
# https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
from itertools import chain

import pycrfsuite
from nltk.corpus.reader import ConllCorpusReader
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

train = ConllCorpusReader("datasets/conll2003", "eng.train",
                          ["words", "pos", "ignore", "chunk"])
test = ConllCorpusReader("datasets/conll2003", "eng.testb",
                         ["words", "pos", "ignore", "chunk"])

train_sents = list(train.iob_sents())
test_sents = list(test.iob_sents())


def word2features(sent, i):
    # remove postag
    word = sent[i][0]
    # postag = sent[i][1]
    features = [
        "bias",
        "word.lower=" + word.lower(),
        "word[-3:]=" + word[-3:],
        "word[-2:]=" + word[-2:],
        "word.isupper=%s" % word.isupper(),
        "word.istitle=%s" % word.istitle(),
        "word.isdigit=%s" % word.isdigit(),
        # 'postag=' + postag,