# --- Example 1 ---
from codecs import open as codecs_open
from os.path import join, split
from hazm import word_tokenize, sent_tokenize, DadeganReader
from nltk.tree import Tree


# NOTE(review): deliberately shadows the builtin `open` so that every file in
# this script is read/written as UTF-8 text.
def open(*args):
	"""UTF-8 wrapper around codecs.open."""
	return codecs_open(*args, encoding='utf8')


def _ann_start_offset(line):
	# Sort key for a brat .ann line: its second space-separated field.
	return int(line.split(' ')[1])


mergeTags = True
dadeganDir = 'Resources/Dadegan-pages'

# Gold chunk trees for sentences 200-299 of the Dadegan test split.
dadegan = DadeganReader('Resources/Dadegan/test.conll')
chunks = list(dadegan.chunked_trees())[200:300]

tsvFp = open('Resources/Dadegan-pages/003.tsv', 'w')
# Annotation lines ordered by character start offset.
sorted_lines = sorted(open('Resources/Dadegan-pages/003.ann').readlines(), key=_ann_start_offset)

def getCurrent_line():
	"""Consume the next brat .ann line and return (type, start, end, text).

	Pops the head of the module-level ``sorted_lines`` list; the line is
	tab-separated as ``id<TAB>type start end<TAB>text``.
	"""
	fields = sorted_lines.pop(0).split('\t')
	label, begin, finish = fields[1].split(' ')
	# Offsets arrive as strings; callers expect ints.
	return label, int(begin), int(finish), fields[2]

global_index = 0
_type, start, end, text = getCurrent_line()
for i in range(100):
	chunk_tree = chunks[i]
	for chunk in chunk_tree:
		if type(chunk) is Tree:
			tokens = chunk.leaves()
# --- Example 2 ---
from hazm import DadeganReader
from baaz import ChunkTreeInformationExtractor, DependencyTreeInformationExtractor
from hazm import Chunker, Normalizer, SequencePOSTagger, IOBTagger, DependencyParser, Lemmatizer
import codecs, re
from nltk import accuracy
from nltk.tree import Tree


# Corpus plus the full hazm pipeline used to re-analyze it.
dadegan = DadeganReader('resources/Dadegan/test.conll')
normalizer = Normalizer()
lemmatizer = Lemmatizer()
tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
chunker = Chunker(model='Resources/chunker-dadeganFull.model')
parser = DependencyParser(tagger, lemmatizer=lemmatizer)

# One information extractor per tree flavour.
chunk_extractor = ChunkTreeInformationExtractor()
dep_extractor = DependencyTreeInformationExtractor()

# Evaluation slices: sentences 0-99 and 200-299 of the test split.
trees = list(dadegan.chunked_trees())
chunk_trees = trees[:100] + trees[200:300]
trees = list(dadegan.trees())
dep_trees = trees[:100] + trees[200:300]

indices = []

def tag_sent(chunks, args):
	tagged_sent = []
	global_index = 0
# --- Example 3 ---
	for arg in info:
		arg_list = []
		index = sent.strip().find(arg)
		if index >= 0:
			tokens = sent.split()
			for i in range(len(tokens)):
				index -= len(tokens[i]) + 1
				if index < 0:
					for c in range(i, i + len(arg.split())):
						arg_list.append(c)
					break
		info_list.append(arg_list)
	return info_list

# NOTE(review): `input` shadows the builtin of the same name.
input = codecs.open('200DadeganSents.txt', 'r', encoding='utf8')

dadegan = DadeganReader('Resources/Dadegan/train.conll')
dadegan_trees = dadegan.trees()
informations = []
# Plain-text form of every training sentence: tokens joined with spaces.
sentences = [' '.join(word for word, tag in sent) for sent in dadegan.sents()]
for tree, chunks, sent in zip(dadegan_trees, dadegan.chunked_trees(), sentences):
	info_list = ([], [], [])
	for information in dependencyExtractor.extract(tree):
		temp_list = positions(information, sent)
		for i in range(3):
			if len(temp_list[i]) > 0 and temp_list[i] not in info_list[i]:
						info_list[i].append(temp_list[i])
	if [] in info_list:
		continue
	else:
# --- Example 4 ---
from hazm import DadeganReader
from baaz import ChunkTreeInformationExtractor, DependencyTreeInformationExtractor
from hazm import Chunker, Normalizer, SequencePOSTagger, IOBTagger, DependencyParser, Lemmatizer
import codecs, re
from nltk import accuracy
from nltk.tree import Tree

# Load the test corpus and build every pipeline component up front.
dadegan = DadeganReader('resources/Dadegan/test.conll')
normalizer = Normalizer()
lemmatizer = Lemmatizer()
tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
chunker = Chunker(model='Resources/chunker-dadeganFull.model')
parser = DependencyParser(tagger, lemmatizer=lemmatizer)

# Extractors over the two tree representations.
chunk_extractor = ChunkTreeInformationExtractor()
dep_extractor = DependencyTreeInformationExtractor()

# Keep sentences 0-99 and 200-299 from each tree view for evaluation.
trees = list(dadegan.chunked_trees())
chunk_trees = trees[:100] + trees[200:300]
trees = list(dadegan.trees())
dep_trees = trees[:100] + trees[200:300]

indices = []


def tag_sent(chunks, args):
    tagged_sent = []
    global_index = 0
    for chunk in chunks:
# --- Example 5 ---
from codecs import open as codecs_open
from os.path import join, split
from hazm import word_tokenize, sent_tokenize, DadeganReader
from nltk.tree import Tree

def open(*args):  # intentionally shadows the builtin: force UTF-8 everywhere
    """Open *args as UTF-8 text via codecs.open."""
    return codecs_open(*args, encoding='utf8')


mergeTags = True
dadeganDir = 'Resources/Dadegan-pages'

# Chunked gold trees for sentences 200-299 of the test split.
dadegan = DadeganReader('Resources/Dadegan/test.conll')
chunks = list(dadegan.chunked_trees())[200:300]

tsvFp = open('Resources/Dadegan-pages/003.tsv', 'w')
# brat .ann lines sorted by start character offset (second space field).
sorted_lines = sorted(
    open('Resources/Dadegan-pages/003.ann').readlines(),
    key=lambda line: int(line.split(' ')[1]),
)


def getCurrent_line():
    """Pop the next .ann annotation line; return (type, start, end, text).

    Relies on the module-level ``sorted_lines`` queue; each line is
    ``id<TAB>type start end<TAB>text``.
    """
    parts = sorted_lines.pop(0).split('\t')
    label, begin, finish = parts[1].split(' ')
    # Character offsets are parsed to int for the callers.
    return label, int(begin), int(finish), parts[2]


global_index = 0
_type, start, end, text = getCurrent_line()
for i in range(100):
    chunk_tree = chunks[i]
    for chunk in chunk_tree:
# --- Example 6 ---
import codecs
from hazm import DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor


# Dump the information tuples extracted from every training sentence:
# chunk-based tuples joined with ' - ', dependency-based ones with ' + ',
# each sentence's pair of groups separated by blank lines.
output = codecs.open('resources/informations.txt', 'w', encoding='utf8')
dadegan = DadeganReader('corpora/train.conll')
chunk_extractor = ChunkTreeInformationExtractor()
dependency_extractor = DependencyTreeInformationExtractor()

for chunked, parsed in zip(dadegan.chunked_trees(), dadegan.trees()):
	for info in chunk_extractor.extract(chunked):
		print(' - '.join(str(part) for part in info), file=output)
	print(file=output)
	for info in dependency_extractor.extract(parsed):
		print(' + '.join(str(part) for part in info), file=output)
	print(file=output)