from hazm import (DadeganReader, Chunker, Normalizer, SequencePOSTagger,
                  IOBTagger, DependencyParser, Lemmatizer)
from baaz import ChunkTreeInformationExtractor, DependencyTreeInformationExtractor
import codecs, re
from nltk import accuracy
from nltk.tree import Tree

# output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8')
dadegan = DadeganReader('resources/Dadegan/test.conll')
tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
lemmatizer = Lemmatizer()
chunker = Chunker(model='Resources/chunker-dadeganFull.model')
parser = DependencyParser(tagger, lemmatizer=lemmatizer)
normalizer = Normalizer()
chunk_extractor = ChunkTreeInformationExtractor()
dep_extractor = DependencyTreeInformationExtractor()

# Evaluation subsets: sentences 0-99 and 200-299, once as chunk trees and
# once as dependency trees.
trees = list(dadegan.chunked_trees())
chunk_trees = trees[:100] + trees[200:300]
trees = list(dadegan.trees())
dep_trees = trees[:100] + trees[200:300]

# dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8')
# sentences = []
# for sent in dadegan.sents():
#     sentences.append(' '.join([w for w, t in sent]))

indices = []


def tag_sent(chunks, args):
    tagged_sent = []
    global_index = 0
    for chunk in chunks:
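        # --- Hedged sketch, not original code: the source file is truncated
        # inside this loop. One plausible continuation, assuming `args` holds
        # (start, end) spans of leaf indices for extracted arguments and the
        # goal is an IOB-tagged sentence; the real body may differ.
        leaves = chunk.leaves() if isinstance(chunk, Tree) else [chunk]
        for word, tag in leaves:
            label = 'O'
            for start, end in args:
                if start <= global_index <= end:
                    label = 'B-ARG' if global_index == start else 'I-ARG'
                    break
            tagged_sent.append((word, label))
            global_index += 1
    return tagged_sent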

#coding=utf8
from hazm import (sent_tokenize, word_tokenize, Normalizer, HamshahriReader,
                  SequencePOSTagger, DependencyParser, Chunker, Lemmatizer,
                  DadeganReader)
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor
from progress.bar import Bar
from nltk import Tree
from itertools import combinations
import codecs, re

# Render a chunk as the plain string of its words.
arg = lambda chunk: ' '.join([word for word, tag in chunk.leaves()])

# hamshahri = HamshahriReader('Resources/Hamshahri/')
# normalizer = Normalizer()
# tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
# parser = DependencyParser(tagger=tagger, lemmatizer=Lemmatizer())
# chunker = Chunker(tagger, model='Resources/chunker-dadeganFull.model')
dependencyExtractor = DependencyTreeInformationExtractor()
chunkExtractor = ChunkTreeInformationExtractor()
texts = []
output = codecs.open('trainingSet_dadegan.txt', 'w', encoding='utf8')


def extractCandidates(chunk_tree):
    candidates = []
    chunks_list = list(chunk_tree)
    # Drop punctuation leaves before walking adjacent chunks.
    for chunk in chunk_tree:
        if type(chunk) is not Tree and chunk[1] == "PUNC":
            chunks_list.remove(chunk)
    for c in range(len(chunks_list)):
        chunk = chunks_list[c]
        if c > 0:
            previous = chunks_list[c - 1]
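            # --- Hedged sketch, not original code: the source file is
            # truncated at this point. One plausible continuation, assuming
            # candidates pair each chunk with its preceding non-punctuation
            # chunk via arg(); the real logic may differ.
            if isinstance(previous, Tree) and isinstance(chunk, Tree):
                candidates.append((arg(previous), arg(chunk)))
    return candidates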

import codecs
from hazm import DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor

output = codecs.open('resources/informations.txt', 'w', encoding='utf8')
dadegan = DadeganReader('corpora/train.conll')
chunk_extractor = ChunkTreeInformationExtractor()
dependency_extractor = DependencyTreeInformationExtractor()

# For each sentence, write the chunk-based triples (fields joined with " - ")
# and then the dependency-based triples (fields joined with " + "); a blank
# line closes each group.
for chunk_tree, dependency_tree in zip(dadegan.chunked_trees(), dadegan.trees()):
    for information in chunk_extractor.extract(chunk_tree):
        print(*information, sep=' - ', file=output)
    print(file=output)
    for information in dependency_extractor.extract(dependency_tree):
        print(*information, sep=' + ', file=output)
    print(file=output)
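
# --- Hedged usage sketch, not part of the original script: one way to read
# the written triples back, assuming the " - " and " + " separators never
# occur inside an argument string.
def read_informations(path='resources/informations.txt'):
    with open(path, encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            sep = ' - ' if ' - ' in line else ' + '
            yield tuple(line.split(sep))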