# Script chunk: aligns brat-style stand-off annotations (003.ann) against
# Dadegan chunk trees 200-300 in order to emit a token-level .tsv file.
from codecs import open as codecs_open
from os.path import join, split
from hazm import word_tokenize, sent_tokenize, DadeganReader
from nltk.tree import Tree

# Shadow the builtin `open` so every file in this script is UTF-8.
open = lambda *args: codecs_open(*args, encoding='utf8')

mergeTags = True
dadeganDir = 'Resources/Dadegan-pages'
dadegan = DadeganReader('Resources/Dadegan/test.conll')
# Sentences 200-300 of the corpus as chunk trees (the page covered by 003.ann).
chunks = list(dadegan.chunked_trees())[200:300]
tsvFp = open('Resources/Dadegan-pages/003.tsv', 'w')
# .ann lines look like "T1\t<type> <start> <end>\t<text>" (see getCurrent_line);
# splitting the raw line on ' ' makes field [1] the <start> offset, so this
# sorts annotations by character start position.
sorted_lines = sorted(open('Resources/Dadegan-pages/003.ann').readlines(),
                      key=lambda l: int(l.split(' ')[1]))


def getCurrent_line():
    """Pop the next annotation off `sorted_lines` and parse it.

    Returns (type, start, end, text) where start/end are int character
    offsets. Mutates the module-level `sorted_lines`; raises IndexError
    when the annotations are exhausted.
    """
    current_line = sorted_lines.pop(0)
    tab_parts = current_line.split('\t')
    _type, start, end = tab_parts[1].split(' ')
    start = int(start)
    end = int(end)
    text = tab_parts[2]
    return _type, start, end, text


# Running character offset into the reconstructed page text — presumably;
# TODO confirm against the truncated loop body below.
global_index = 0
_type, start, end, text = getCurrent_line()
for i in range(100):
    chunk_tree = chunks[i]
    for chunk in chunk_tree:
        if type(chunk) is Tree:
            tokens = chunk.leaves()
            # NOTE(review): SOURCE chunk is truncated here; the remainder of
            # the alignment loop is not visible in this view.
# Script chunk: evaluation setup. Loads the Dadegan test split plus trained
# POS-tagger / chunker / dependency-parser models so that chunk-based and
# dependency-based information extraction can be compared on the same
# sentence subsets.
from hazm import DadeganReader
from baaz import ChunkTreeInformationExtractor, DependencyTreeInformationExtractor
from hazm import Chunker, Normalizer, SequencePOSTagger, IOBTagger, DependencyParser, Lemmatizer
import codecs, re
from nltk import accuracy
from nltk.tree import Tree

#output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8')
dadegan = DadeganReader('resources/Dadegan/test.conll')
tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
lemmatizer = Lemmatizer()
chunker = Chunker(model='Resources/chunker-dadeganFull.model')
parser = DependencyParser(tagger, lemmatizer=lemmatizer)
normalizer = Normalizer()
chunk_extractor = ChunkTreeInformationExtractor()
dep_extractor = DependencyTreeInformationExtractor()

# Evaluation subsets: sentences 0-100 and 200-300, materialized both as
# chunk trees and as dependency trees. `trees` is deliberately reused as a
# scratch variable for the two materializations.
trees = list(dadegan.chunked_trees())
chunk_trees = trees[:100] + trees[200:300]
trees = list(dadegan.trees())
dep_trees = trees[:100] + trees[200:300]

#dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8')
#sentences = []
#for sent in dadegan.sents():
#    sentences.append(' '.join([w for w, t in sent]))
indices = []


def tag_sent(chunks, args):
    # Presumably builds an IOB-style tagging of a chunked sentence against
    # the extracted arguments — TODO confirm; the body is truncated in this
    # chunk after the two initializations below.
    tagged_sent = []
    global_index = 0
    # NOTE(review): SOURCE chunk is truncated here.
# Script chunk: truncated at BOTH ends. It opens inside the body of an
# unseen helper — named `positions` judging by the call further down — whose
# enclosing `def` line (defining `info`, `sent`, `info_list`) is outside
# this view. The helper maps each extracted argument string to the list of
# token indices it covers in the sentence.
    for arg in info:
        arg_list = []
        # Character offset of the argument inside the sentence; -1 if absent.
        index = sent.strip().find(arg)
        if index >= 0:
            tokens = sent.split()
            for i in range(len(tokens)):
                # Consume token i plus the single space after it; when the
                # running offset goes negative, the argument starts at token i.
                index -= len(tokens[i]) + 1
                if index < 0:
                    # Record the index of every token the argument spans
                    # (one entry per whitespace-separated word of `arg`).
                    for c in range(i, i + len(arg.split())):
                        arg_list.append(c)
                    break
        info_list.append(arg_list)
    return info_list


# `input` shadows the builtin of the same name.
input = codecs.open('200DadeganSents.txt', 'r', encoding='utf8')
dadegan = DadeganReader('Resources/Dadegan/train.conll')
dadegan_trees = dadegan.trees()
informations = []
sentences = []
# One plain-text sentence per corpus entry: words joined by single spaces,
# POS tags dropped.
for sentence in dadegan.sents():
    sentences.append(' '.join([w for w, t in sentence]))

for tree, chunks, sent in zip(dadegan_trees, dadegan.chunked_trees(), sentences):
    # Three slots, one list per component of an extracted information triple
    # (presumably argument1 / relation / argument2 — TODO confirm against
    # the extractor).
    info_list = ([], [], [])
    for information in dependencyExtractor.extract(tree):
        temp_list = positions(information, sent)
        for i in range(3):
            # Keep only non-empty, not-yet-seen position lists per slot.
            if len(temp_list[i]) > 0 and temp_list[i] not in info_list[i]:
                info_list[i].append(temp_list[i])
    # Skip sentences where any slot ended up with no located positions.
    if [] in info_list:
        continue
    else:
        # NOTE(review): SOURCE chunk is truncated here; the success branch
        # is not visible in this view.
# Script chunk: evaluation setup (near-duplicate of the variant earlier in
# this file). Loads the Dadegan test split plus trained POS-tagger /
# chunker / dependency-parser models to compare chunk-based and
# dependency-based information extraction on the same sentence subsets.
from hazm import DadeganReader
from baaz import ChunkTreeInformationExtractor, DependencyTreeInformationExtractor
from hazm import Chunker, Normalizer, SequencePOSTagger, IOBTagger, DependencyParser, Lemmatizer
import codecs, re
from nltk import accuracy
from nltk.tree import Tree

#output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8')
dadegan = DadeganReader('resources/Dadegan/test.conll')
tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
lemmatizer = Lemmatizer()
chunker = Chunker(model='Resources/chunker-dadeganFull.model')
parser = DependencyParser(tagger, lemmatizer=lemmatizer)
normalizer = Normalizer()
chunk_extractor = ChunkTreeInformationExtractor()
dep_extractor = DependencyTreeInformationExtractor()

# Evaluation subsets: sentences 0-100 and 200-300, materialized both as
# chunk trees and as dependency trees. `trees` is reused as scratch.
trees = list(dadegan.chunked_trees())
chunk_trees = trees[:100] + trees[200:300]
trees = list(dadegan.trees())
dep_trees = trees[:100] + trees[200:300]

#dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8')
#sentences = []
#for sent in dadegan.sents():
#    sentences.append(' '.join([w for w, t in sent]))
indices = []


def tag_sent(chunks, args):
    # Presumably walks the sentence's chunks, tagging tokens that fall inside
    # the extracted arguments — TODO confirm; body truncated in this chunk.
    tagged_sent = []
    global_index = 0
    for chunk in chunks:
        # NOTE(review): SOURCE chunk is truncated here; the loop body is not
        # visible in this view.
# Script chunk: near-duplicate of the 003.ann alignment script earlier in
# this file, truncated even earlier (the inner loop has no visible body).
from codecs import open as codecs_open
from os.path import join, split
from hazm import word_tokenize, sent_tokenize, DadeganReader
from nltk.tree import Tree

# Shadow the builtin `open` so every file in this script is UTF-8.
open = lambda *args: codecs_open(*args, encoding='utf8')

mergeTags = True
dadeganDir = 'Resources/Dadegan-pages'
dadegan = DadeganReader('Resources/Dadegan/test.conll')
# Sentences 200-300 of the corpus as chunk trees (the page covered by 003.ann).
chunks = list(dadegan.chunked_trees())[200:300]
tsvFp = open('Resources/Dadegan-pages/003.tsv', 'w')
# .ann lines look like "T1\t<type> <start> <end>\t<text>" (see getCurrent_line);
# field [1] of a raw split-on-space line is <start>, so annotations are
# sorted by character start offset.
sorted_lines = sorted(open('Resources/Dadegan-pages/003.ann').readlines(),
                      key=lambda l: int(l.split(' ')[1]))


def getCurrent_line():
    """Pop the next annotation off `sorted_lines` and parse it.

    Returns (type, start, end, text) where start/end are int character
    offsets. Mutates the module-level `sorted_lines`; raises IndexError
    when the annotations are exhausted.
    """
    current_line = sorted_lines.pop(0)
    tab_parts = current_line.split('\t')
    _type, start, end = tab_parts[1].split(' ')
    start = int(start)
    end = int(end)
    text = tab_parts[2]
    return _type, start, end, text


global_index = 0
_type, start, end, text = getCurrent_line()
for i in range(100):
    chunk_tree = chunks[i]
    for chunk in chunk_tree:
# NOTE(review): SOURCE chunk is truncated here — the inner loop's body is
# entirely outside this view, so this fragment is not runnable as-is.
"""Dump extracted informations from the Dadegan training corpus to a file.

For every sentence, writes the chunk-extractor triples (fields joined by
' - ') followed by a blank line, then the dependency-extractor triples
(fields joined by ' + ') followed by a blank line.
"""
import codecs

from hazm import DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor

dadegan = DadeganReader('corpora/train.conll')
chunk_extractor = ChunkTreeInformationExtractor()
dependency_extractor = DependencyTreeInformationExtractor()

# Fix: the original opened `output` and never closed it, so the final buffer
# could be lost if the interpreter died mid-run; `with` guarantees flush and
# close even when extraction raises.
with codecs.open('resources/informations.txt', 'w', encoding='utf8') as output:
    # Walk chunk trees and dependency trees of the same sentences in lockstep.
    for chunk_tree, dependency_tree in zip(dadegan.chunked_trees(), dadegan.trees()):
        for information in chunk_extractor.extract(chunk_tree):
            print(*information, sep=' - ', file=output)
        # Blank line terminates this sentence's chunk-extractor section.
        # NOTE(review): the flattened source is ambiguous about whether this
        # blank print sat inside or after the inner loop; placed after, per
        # the per-section separator reading — confirm against upstream.
        print(file=output)
        for information in dependency_extractor.extract(dependency_tree):
            print(*information, sep=' + ', file=output)
        print(file=output)