from hazm import DadeganReader
from baaz import ChunkTreeInformationExtractor, DependencyTreeInformationExtractor
from hazm import Chunker, Normalizer, SequencePOSTagger, IOBTagger, DependencyParser, Lemmatizer
import codecs, re
from nltk import accuracy
from nltk.tree import Tree


# Evaluation setup: load the Dadegan test corpus and build the hazm NLP
# pipeline plus the two baaz information extractors that are compared below.
#output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8')
dadegan = DadeganReader('resources/Dadegan/test.conll')
tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
lemmatizer = Lemmatizer()
chunker = Chunker(model='Resources/chunker-dadeganFull.model')
parser = DependencyParser(tagger, lemmatizer=lemmatizer )
normalizer = Normalizer()
chunk_extractor = ChunkTreeInformationExtractor()
dep_extractor = DependencyTreeInformationExtractor()
# Evaluation subset: sentences 0-99 and 200-299 of the corpus, taken from
# both the chunked-tree view and the dependency-tree view (same indices,
# so the two lists are aligned sentence-for-sentence).
trees = list(dadegan.chunked_trees())
chunk_trees = trees[:100] + trees[200:300]
trees = list(dadegan.trees())
dep_trees = trees[:100] + trees[200:300]
#dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8')
#sentences = []
#for sent in dadegan.sents():
#	sentences.append(' '.join([w for w, t in sent]))
indices = []  # presumably filled by the code that follows (not visible here) — TODO confirm

def tag_sent(chunks, args):
	tagged_sent = []
	global_index = 0
	for chunk in chunks:
# --- Exemple #2 (scrape-page separator; the snippet below duplicates the one above) ---
from hazm import DadeganReader
from baaz import ChunkTreeInformationExtractor, DependencyTreeInformationExtractor
from hazm import Chunker, Normalizer, SequencePOSTagger, IOBTagger, DependencyParser, Lemmatizer
import codecs, re
from nltk import accuracy
from nltk.tree import Tree

# Evaluation setup (duplicate of the snippet above): Dadegan test corpus,
# hazm pipeline components, and the two baaz information extractors.
#output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8')
dadegan = DadeganReader('resources/Dadegan/test.conll')
tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
lemmatizer = Lemmatizer()
chunker = Chunker(model='Resources/chunker-dadeganFull.model')
parser = DependencyParser(tagger, lemmatizer=lemmatizer)
normalizer = Normalizer()
chunk_extractor = ChunkTreeInformationExtractor()
dep_extractor = DependencyTreeInformationExtractor()
# Evaluation subset: sentences 0-99 and 200-299, aligned between the
# chunked-tree and dependency-tree views of the same corpus.
trees = list(dadegan.chunked_trees())
chunk_trees = trees[:100] + trees[200:300]
trees = list(dadegan.trees())
dep_trees = trees[:100] + trees[200:300]
#dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8')
#sentences = []
#for sent in dadegan.sents():
#	sentences.append(' '.join([w for w, t in sent]))
indices = []  # presumably filled by the code that follows (not visible here) — TODO confirm


def tag_sent(chunks, args):
    tagged_sent = []
    global_index = 0
    for chunk in chunks:
# --- Exemple #3 (scrape-page separator) ---
#coding=utf8
from hazm import sent_tokenize, word_tokenize, Normalizer, HamshahriReader, SequencePOSTagger, DependencyParser, Chunker, Lemmatizer, DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor
from progress.bar import Bar
from nltk import Tree
from itertools import combinations
import codecs, re

arg = lambda chunk: ' '.join([word for word, tag in chunk.leaves()])
#hamshahri = HamshahriReader('Resources/Hamshahri/')
#normalizer = Normalizer()
#tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
#parser = DependencyParser(tagger=tagger, lemmatizer=Lemmatizer())
#chunker = Chunker(tagger, model='Resources/chunker-dadeganFull.model')
# The two baaz extractors whose outputs build the training-set file below.
dependencyExtractor = DependencyTreeInformationExtractor()
chunkExtractor = ChunkTreeInformationExtractor()
texts = []  # NOTE(review): appears unused in the visible part of this snippet

# Destination for the generated training examples (UTF-8 text).
output = codecs.open('trainingSet_dadegan.txt', 'w', encoding='utf8')

def extractCandidates(chunk_tree):
	candidates = []
	chunks_list = list(chunk_tree)
	for chunk in chunk_tree:
		if type(chunk) is not Tree and chunk[1] == "PUNC":
			chunks_list.remove(chunk)

	for c in range(len(chunks_list)):
		chunk = chunks_list[c]
		if c > 0:
			previuos = chunks_list[c - 1]
# --- Exemple #4 (scrape-page separator) ---
import codecs
from hazm import DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor


# Dump the information tuples produced by both baaz extractors for every
# sentence of the Dadegan training split into one side-by-side text file.
output = codecs.open('resources/informations.txt', 'w', encoding='utf8')
dadegan = DadeganReader('corpora/train.conll')
chunk_extractor = ChunkTreeInformationExtractor()
dependency_extractor = DependencyTreeInformationExtractor()

# Fix: the file handle was never closed; the with-block guarantees it is
# flushed and closed even if extraction raises part-way through.
with output:
	# chunked_trees() and trees() iterate the same sentences in the same
	# order, so zip pairs the two views of each sentence.
	for chunk_tree, dependency_tree in zip(dadegan.chunked_trees(), dadegan.trees()):
		# Chunk-based extractions: one tuple per line, fields joined by ' - '.
		for information in chunk_extractor.extract(chunk_tree):
			print(*information, sep=' - ', file=output)
		print(file=output)  # blank line separates the two extractors' output
		# Dependency-based extractions: fields joined by ' + '.
		for information in dependency_extractor.extract(dependency_tree):
			print(*information, sep=' + ', file=output)
		print(file=output)  # blank line ends this sentence's record