Ejemplo n.º 1
0
from nltk import accuracy
from nltk.tree import Tree


# Module-level setup: build the corpus reader, the NLP components and the two
# information extractors, then select 200 trees of each kind from the corpus.
# NOTE(review): the corpus path below starts with 'resources/' while the model
# paths start with 'Resources/'; on a case-sensitive filesystem only one of the
# two spellings can exist -- confirm which is intended.
#output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8')
dadegan = DadeganReader('resources/Dadegan/test.conll')
tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
lemmatizer = Lemmatizer()
chunker = Chunker(model='Resources/chunker-dadeganFull.model')
parser = DependencyParser(tagger, lemmatizer=lemmatizer )
normalizer = Normalizer()
chunk_extractor = ChunkTreeInformationExtractor()
dep_extractor = DependencyTreeInformationExtractor()
# `trees` is a scratch name: it first holds every chunked tree, then is rebound
# to every dependency tree. From each list keep items 0-99 and 200-299.
trees = list(dadegan.chunked_trees())
chunk_trees = trees[:100] + trees[200:300]
trees = list(dadegan.trees())
dep_trees = trees[:100] + trees[200:300]
# Disabled code kept from the original: an output file and a list of
# space-joined sentences.
#dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8')
#sentences = []
#for sent in dadegan.sents():
#	sentences.append(' '.join([w for w, t in sent]))
# Starts empty; nothing in the visible part of the script fills it.
indices = []

def tag_sent(chunks, args):
	tagged_sent = []
	global_index = 0
	for chunk in chunks:
		if type(chunk) is Tree:
			tokens = chunk.leaves()
		else:
			tokens = [chunk]
Ejemplo n.º 2
0
import codecs, re
from nltk import accuracy
from nltk.tree import Tree

# Module-level setup: build the corpus reader, the NLP components and the two
# information extractors, then select 200 trees of each kind from the corpus.
# NOTE(review): the corpus path below starts with 'resources/' while the model
# paths start with 'Resources/'; on a case-sensitive filesystem only one of the
# two spellings can exist -- confirm which is intended.
#output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8')
dadegan = DadeganReader('resources/Dadegan/test.conll')
tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
lemmatizer = Lemmatizer()
chunker = Chunker(model='Resources/chunker-dadeganFull.model')
parser = DependencyParser(tagger, lemmatizer=lemmatizer)
normalizer = Normalizer()
chunk_extractor = ChunkTreeInformationExtractor()
dep_extractor = DependencyTreeInformationExtractor()
# `trees` is a scratch name: it first holds every chunked tree, then is rebound
# to every dependency tree. From each list keep items 0-99 and 200-299.
trees = list(dadegan.chunked_trees())
chunk_trees = trees[:100] + trees[200:300]
trees = list(dadegan.trees())
dep_trees = trees[:100] + trees[200:300]
# Disabled code kept from the original: an output file and a list of
# space-joined sentences.
#dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8')
#sentences = []
#for sent in dadegan.sents():
#	sentences.append(' '.join([w for w, t in sent]))
# Starts empty; nothing in the visible part of the script fills it.
indices = []


def tag_sent(chunks, args):
    tagged_sent = []
    global_index = 0
    for chunk in chunks:
        if type(chunk) is Tree:
            tokens = chunk.leaves()
        else:
Ejemplo n.º 3
0
		arg_list = []
		index = sent.strip().find(arg)
		if index >= 0:
			tokens = sent.split()
			for i in range(len(tokens)):
				index -= len(tokens[i]) + 1
				if index < 0:
					for c in range(i, i + len(arg.split())):
						arg_list.append(c)
					break
		info_list.append(arg_list)
	return info_list

# Driver: for every corpus sentence, collect the token positions of each
# extracted information item and hand fully populated results to tag_sent().
#
# NOTE(review): this handle shadows the built-in `input`, and nothing in the
# visible part of the script reads from it or closes it -- confirm it is needed.
input = codecs.open('200DadeganSents.txt', 'r', encoding='utf8')
dadegan = DadeganReader('Resources/Dadegan/train.conll')
dadegan_trees = dadegan.trees()
informations = []

# One space-joined string per sentence; the second member of every pair
# yielded by sents() is discarded.
sentences = []
for sentence in dadegan.sents():
	sentences.append(' '.join([w for w, t in sentence]))

for tree, chunks, sent in zip(dadegan_trees, dadegan.chunked_trees(), sentences):
	# Three buckets, filled in parallel from entries 0..2 of positions().
	info_list = ([], [], [])
	for information in dependencyExtractor.extract(tree):
		temp_list = positions(information, sent)
		for i, bucket in enumerate(info_list):
			candidate = temp_list[i]
			# Skip empty position lists and ones already recorded.
			if len(candidate) > 0 and candidate not in bucket:
				bucket.append(candidate)
	# A sentence is tagged only when every bucket received at least one entry.
	if any(bucket == [] for bucket in info_list):
		continue
	tag_sent(chunks, info_list)
Ejemplo n.º 4
0
import codecs
from hazm import DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor


# Write every information item extracted from the corpus at
# 'corpora/train.conll' to 'resources/informations.txt'. For each sentence the
# chunk-based items come first (fields joined by ' - '), then a blank line,
# then the dependency-based items (fields joined by ' + '), then a blank line.
dadegan = DadeganReader('corpora/train.conll')
chunk_extractor = ChunkTreeInformationExtractor()
dependency_extractor = DependencyTreeInformationExtractor()

# The original never closed the output file. The with-block closes and flushes
# it even if an extractor raises part-way through. Opening it only after the
# reader and extractors are built also avoids truncating an existing file when
# that setup fails.
with codecs.open('resources/informations.txt', 'w', encoding='utf8') as output:
	for chunk_tree, dependency_tree in zip(dadegan.chunked_trees(), dadegan.trees()):
		for information in chunk_extractor.extract(chunk_tree):
			print(*information, sep=' - ', file=output)
		print(file=output)
		for information in dependency_extractor.extract(dependency_tree):
			print(*information, sep=' + ', file=output)
		print(file=output)