from hazm import (Normalizer, Lemmatizer, POSTagger, DependencyParser,
                  sent_tokenize, word_tokenize)
from baaz import DependencyTreeInformationExtractor


class InformationExtractor:
    def __init__(self):
        # Load the uninformative-adverb list from data/adverbs.dat: blank and
        # '#' comment lines are skipped; each remaining line is split on ' - '
        # and the pieces are pooled into one set.
        self.uninformatives = set(
            sum([
                line.split(' - ') for line in open(
                    'data/adverbs.dat', encoding='utf8').read().split('\n')
                if line.strip() and not line.startswith('#')
            ], []))
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.parser = DependencyParser(tagger=self.tagger,
                                       lemmatizer=Lemmatizer())
        self.extractor = DependencyTreeInformationExtractor()

    def analyze(self, text):
        # Normalize the text, sentence-tokenize it, and drop very short
        # sentences (15 characters or fewer).
        sentences = [
            sentence
            for sentence in sent_tokenize(self.normalizer.normalize(text))
            if len(sentence) > 15
        ]
        if not sentences:
            return

        # Parse all sentences in one batch; yield only extractions whose
        # second field is not an uninformative adverb.
        for sentence, tree in zip(
                sentences,
                self.parser.parse_sents(map(word_tokenize, sentences))):
            for information in self.extractor.extract(tree):
                if information[1] not in self.uninformatives:
                    yield information
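
A minimal usage sketch for the class above; the sample text is a placeholder, and analyze is a generator, so its results must be iterated:

extractor = InformationExtractor()
sample_text = '...'  # hypothetical: any Persian text longer than 15 characters
for information in extractor.analyze(sample_text):
    print(*information, sep=' - ')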
Example #2
import codecs
from itertools import chain

from hazm import HamshahriReader, PersicaReader, Normalizer, Lemmatizer, POSTagger, sent_tokenize, word_tokenize
from hazm import TurboParser  # available in older hazm releases
from baaz import DependencyTreeInformationExtractor

hamshahri = HamshahriReader(root='corpora/hamshahri')  # corpus path is an assumption
persica = PersicaReader(csv_file='corpora/persica.csv')
uninformatives = set(sum([line.split(' - ') for line in codecs.open('data/adverbs.dat', encoding='utf8').read().split('\n') if line.strip() and not line.startswith('#')], []))


normalizer = Normalizer()
tagger = POSTagger(model='resources/postagger.model')
parser = TurboParser(tagger=tagger, lemmatizer=Lemmatizer(), model_file='resources/turboparser.model')
extractor = DependencyTreeInformationExtractor()


informations = codecs.open('resources/informations.txt', 'a+', encoding='utf8')
informations.seek(0)  # 'a+' positions the stream at end of file; rewind before reading
processed_sentences = set([line.strip()[2:] for line in informations if line.startswith('#')])


for text in chain(hamshahri.texts(), persica.texts()):
	sentences = []
	try:
		# Keep sentences longer than 15 characters that are not already in the output file.
		sentences = [sentence for sentence in sent_tokenize(normalizer.normalize(text)) if len(sentence) > 15 and sentence not in processed_sentences]
		if not sentences:
			continue

		for sentence, tree in zip(sentences, parser.parse_sents(map(word_tokenize, sentences))):
			# Write the sentence as a '# ' marker line, then one information per line.
			print('#', sentence, file=informations)

			for information in extractor.extract(tree):
				if information[1] not in uninformatives:
					print(*information, sep=' - ', file=informations)
			print(file=informations)

	except Exception as error:
		print(error, 'while processing:', *sentences, sep='\n')
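
The script writes one record per sentence: a '# '-prefixed marker line, one ' - '-separated information per line, then a blank separator. A sketch of reading that format back, with the grouping logic as an assumption based on the writes above:

def read_informations(path='resources/informations.txt'):
	# Yield (sentence, [information tuples]) groups from the record format above.
	sentence, infos = None, []
	with codecs.open(path, encoding='utf8') as lines:
		for line in lines:
			line = line.strip()
			if line.startswith('# '):
				sentence, infos = line[2:], []
			elif line:
				infos.append(tuple(line.split(' - ')))
			elif sentence is not None:
				yield sentence, infos
				sentence, infos = None, []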
Example #3
sentences = []
evaluation_sents = []
for gold_sent in gold:
	sentences.append([w for w, t, c, l in gold_sent])
# chunk_trees and dep_trees below assume these lines (or a cached equivalent) have run:
#tokens = tagger.tag_sents(sentences)
#chunk_trees = list(chunker.parse_sents(tokens))
#dep_trees = list(parser.parse_sents(sentences))
dep_tagged_sents = []
chunk_tagged_sents = []
for number, gold_sent in enumerate(gold):

	sentence = ' '.join(sentences[number])
	chunk_tree = chunk_trees[number]
	dep_tree = dep_trees[number]
	chunk_informations = list(chunk_extractor.extract(chunk_tree))
	dep_informations = list(dep_extractor.extract(dep_tree))
	evaluation_sent = [(w, l) for w, t, c, l in gold_sent]
	dep_tagged_sent = [(w, l) for w, t, c, l in info2iob(sentence, chunk_tree, dep_informations)]
	chunk_tagged_sent = [(w, l) for w, t, c, l in info2iob(sentence, chunk_tree, chunk_informations)]
	if len(evaluation_sent) == len(dep_tagged_sent):
		evaluation_sents.append(evaluation_sent)
		dep_tagged_sents.append(dep_tagged_sent)
		chunk_tagged_sents.append(chunk_tagged_sent)
	else:
		# Token-count mismatch with the gold sentence: skip it, but print for debugging.
		print(chunk_tagged_sent)
		print()
print('dependency accuracy: %f' % (accuracy(sum(evaluation_sents, []), sum(dep_tagged_sents, []))))
print('chunk accuracy: %f' % (accuracy(sum(evaluation_sents, []), sum(chunk_tagged_sents, []))))

information_tagger = IOBTagger(model='informations-all.model')
print(information_tagger.evaluate(gold))
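
Both accuracy figures are token-level: the per-sentence lists are flattened with sum(..., []) and compared pair by pair. A minimal stand-in, assuming accuracy here has the semantics of nltk.metrics.accuracy:

def token_accuracy(reference, test):
	# Fraction of positions where the (word, label) pairs agree exactly.
	assert len(reference) == len(test)
	return sum(1 for r, t in zip(reference, test) if r == t) / len(reference)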
Example #4
					for c in range(i, i + len(arg.split())):
						arg_list.append(c)
					break
		info_list.append(arg_list)
	return info_list

input_file = codecs.open('200DadeganSents.txt', 'r', encoding='utf8')
dadegan = DadeganReader('Resources/Dadegan/train.conll')
dadegan_trees = dadegan.trees()
informations = []
sentences = []
for sentence in dadegan.sents():
	sentences.append(' '.join([w for w, t in sentence]))
for tree, chunks, sent in zip(dadegan_trees, dadegan.chunked_trees(), sentences):
	info_list = ([], [], [])
	for information in dependencyExtractor.extract(tree):
		temp_list = positions(information, sent)
		for i in range(3):
			if len(temp_list[i]) > 0 and temp_list[i] not in info_list[i]:
				info_list[i].append(temp_list[i])
	if [] in info_list:
		continue
	tag_sent(chunks, info_list)

"""
for line in input.readlines():
	if len(re.findall(r'^\d+-', line)) == 0:
		if len(line) > 1:
			informations.append(line.replace('\n', '').split(' + '))
		else:
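
The positions helper is cut off at the top of this example; from its visible tail it maps each field of an information triple to the token indexes that field occupies in the sentence. A reconstruction sketch, with the first-match strategy as an assumption:

def positions(information, sentence):
	# For each field of the triple, collect the word-index span of its first
	# occurrence in the sentence; an absent field yields an empty list.
	words = sentence.split()
	info_list = []
	for arg in information:
		arg_list = []
		arg_words = arg.split()
		for i in range(len(words) - len(arg_words) + 1):
			if words[i:i + len(arg_words)] == arg_words:
				for c in range(i, i + len(arg_words)):
					arg_list.append(c)
				break
		info_list.append(arg_list)
	return info_list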
Example #5
import codecs
from itertools import chain

from hazm import HamshahriReader, PersicaReader, Normalizer, Lemmatizer, POSTagger, DependencyParser, sent_tokenize, word_tokenize
from baaz import DependencyTreeInformationExtractor

hamshahri = HamshahriReader(root='corpora/hamshahri')  # corpus paths are assumptions
persica = PersicaReader(csv_file='corpora/persica.csv')
uninformatives = set(sum([line.split(' - ') for line in codecs.open('data/adverbs.dat', encoding='utf8').read().split('\n') if line.strip() and not line.startswith('#')], []))

normalizer = Normalizer()
tagger = POSTagger(model='resources/postagger.model')
# parser = TurboParser(tagger=tagger, lemmatizer=Lemmatizer(), model_file='resources/turboparser.model')
parser = DependencyParser(tagger=tagger, lemmatizer=Lemmatizer())
extractor = DependencyTreeInformationExtractor()


informations = codecs.open('resources/informations.txt', 'a+', encoding='utf8')
informations.seek(0)  # 'a+' positions the stream at end of file; rewind before reading
processed_sentences = set([line.strip()[2:] for line in informations if line.startswith('#')])


errors = codecs.open('resources/errors.txt', 'w', encoding='utf8')

for text in chain(hamshahri.texts(), persica.texts()):
	sentences = []
	try:
		sentences = [sentence for sentence in sent_tokenize(normalizer.normalize(text)) if len(sentence) > 15 and sentence not in processed_sentences]
		if not sentences:
			continue

		for sentence, tree in zip(sentences, parser.parse_sents(map(word_tokenize, sentences))):
			print('#', sentence, file=informations)

			for information in extractor.extract(tree):
				if information[1] not in uninformatives:
					print(*information, sep=' - ', file=informations)
			print(file=informations)

	except Exception as error:
		# Log failures to the separate errors file instead of stdout.
		print(error, 'while processing:', *sentences, sep='\n', file=errors)
Example #6
import codecs
from hazm import DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor


output = codecs.open('resources/informations.txt', 'w', encoding='utf8')
dadegan = DadeganReader('corpora/train.conll')
chunk_extractor = ChunkTreeInformationExtractor()
dependency_extractor = DependencyTreeInformationExtractor()

for chunk_tree, dependency_tree in zip(dadegan.chunked_trees(), dadegan.trees()):
	# Chunk-based extractions use ' - ' as the field separator ...
	for information in chunk_extractor.extract(chunk_tree):
		print(*information, sep=' - ', file=output)
	print(file=output)
	# ... while dependency-based extractions use ' + ', so the two stay distinguishable.
	for information in dependency_extractor.extract(dependency_tree):
		print(*information, sep=' + ', file=output)
	print(file=output)
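
A variant of the same loop that only tallies what each extractor finds; calling dadegan.chunked_trees() and dadegan.trees() again yields fresh generators, so this can run after the dump above:

chunk_count = dependency_count = 0
for chunk_tree, dependency_tree in zip(dadegan.chunked_trees(), dadegan.trees()):
	chunk_count += sum(1 for _ in chunk_extractor.extract(chunk_tree))
	dependency_count += sum(1 for _ in dependency_extractor.extract(dependency_tree))
print('chunk informations:', chunk_count)
print('dependency informations:', dependency_count)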