class InformationExtractor():
    """Extracts (argument1, relation, argument2) informations from Persian text.

    Pipeline: normalize -> sentence-tokenize -> POS-tag -> dependency-parse ->
    extract informations from each tree, dropping relations that appear in the
    uninformative-adverbs data file.
    NOTE(review): Normalizer/POSTagger/DependencyParser/Lemmatizer and the
    tokenizers are imported elsewhere in the file (hazm), as is
    DependencyTreeInformationExtractor.
    """

    def __init__(self):
        # adverbs.dat: one ' - '-separated group of uninformative relation
        # words per line; '#' lines are comments.  Use a context manager so
        # the file handle is closed (the original leaked it), and a set
        # comprehension instead of the quadratic sum(lists, []) flatten.
        with open('data/adverbs.dat', encoding='utf8') as adverbs:
            self.uninformatives = {
                word
                for line in adverbs.read().split('\n')
                if line.strip() and not line.startswith('#')
                for word in line.split(' - ')
            }
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.parser = DependencyParser(tagger=self.tagger,
                                       lemmatizer=Lemmatizer())
        self.extractor = DependencyTreeInformationExtractor()

    def analyze(self, text):
        """Yield informations for every sentence of `text` longer than 15 chars.

        Generator: yields tuples whose second element (the relation) is not in
        the uninformatives set.
        """
        sentences = [
            sentence
            for sentence in sent_tokenize(self.normalizer.normalize(text))
            if len(sentence) > 15
        ]
        if not sentences:
            # Bare return ends the generator; the original's `return []` only
            # set an unused StopIteration value, it did NOT return a list.
            return
        parsed = self.parser.parse_sents(map(word_tokenize, sentences))
        for sentence, tree in zip(sentences, parsed):
            for information in self.extractor.extract(tree):
                if information[1] not in self.uninformatives:
                    yield information
class InformationExtractor():
    """Extract informations (triples) from Persian text via dependency parsing.

    NOTE(review): hazm components (Normalizer, POSTagger, DependencyParser,
    Lemmatizer, sent_tokenize, word_tokenize) and
    DependencyTreeInformationExtractor are imported elsewhere in the file.
    """

    def __init__(self):
        # Read the uninformative-relation word list.  Fixes of the original:
        # close the file (context manager) and flatten with a comprehension
        # rather than the quadratic sum(lists, []).
        with open('data/adverbs.dat', encoding='utf8') as handle:
            lines = handle.read().split('\n')
        self.uninformatives = {
            word
            for line in lines
            if line.strip() and not line.startswith('#')  # skip blanks/comments
            for word in line.split(' - ')
        }
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.parser = DependencyParser(tagger=self.tagger,
                                       lemmatizer=Lemmatizer())
        self.extractor = DependencyTreeInformationExtractor()

    def analyze(self, text):
        """Generator of informations for each sufficiently long sentence."""
        sentences = [
            sentence
            for sentence in sent_tokenize(self.normalizer.normalize(text))
            if len(sentence) > 15
        ]
        if not sentences:
            return  # end the generator; original `return []` discarded the list
        trees = self.parser.parse_sents(map(word_tokenize, sentences))
        for sentence, tree in zip(sentences, trees):
            for information in self.extractor.extract(tree):
                if information[1] not in self.uninformatives:
                    yield information
# Batch job: extract informations from the Hamshahri + Persica corpora and
# append them to resources/informations.txt, skipping sentences already done.
# NOTE(review): `hamshahri`, `codecs`, `chain`, the hazm classes and the
# tokenizers are defined/imported elsewhere in the file.
persica = PersicaReader(csv_file='corpora/persica.csv')

# Uninformative relation words: ' - '-separated per line, '#' = comment.
# Close the data file and avoid the quadratic sum(lists, []) flatten.
with codecs.open('data/adverbs.dat', encoding='utf8') as adverbs:
    uninformatives = {
        word
        for line in adverbs.read().split('\n')
        if line.strip() and not line.startswith('#')
        for word in line.split(' - ')
    }

normalizer = Normalizer()
tagger = POSTagger(model='resources/postagger.model')
parser = TurboParser(tagger=tagger, lemmatizer=Lemmatizer(),
                     model_file='resources/turboparser.model')
extractor = DependencyTreeInformationExtractor()

informations = codecs.open('resources/informations.txt', 'a+', encoding='utf8')
# BUG FIX: mode 'a+' positions the stream at EOF, so the original read an
# empty processed_sentences set here.  Rewind before reading; appends still
# land at EOF because 'a' mode seeks to the end on every write.
informations.seek(0)
processed_sentences = {line.strip()[2:]  # drop the '# ' marker
                       for line in informations if line.startswith('#')}

for text in chain(hamshahri.texts(), persica.texts()):
    # Pre-bind so the except handler never sees an unbound or stale value
    # when the failure happens before `sentences` is assigned.
    sentences = []
    try:
        sentences = [
            sentence
            for sentence in sent_tokenize(normalizer.normalize(text))
            if len(sentence) > 15 and sentence not in processed_sentences
        ]
        if not sentences:
            continue
        parsed = parser.parse_sents(map(word_tokenize, sentences))
        for sentence, tree in zip(sentences, parsed):
            print('#', sentence, file=informations)
            for information in extractor.extract(tree):
                if information[1] not in uninformatives:
                    print(*information, sep=' - ', file=informations)
            print(file=informations)
    except Exception as error:  # best-effort batch job: report and continue
        print(error, 'while processing:', *sentences, sep='\n')
# Evaluate dependency-based vs chunk-based information extraction against a
# gold corpus of (word, tag, chunk, label) tuples.
# NOTE(review): `gold`, `chunk_trees`, `dep_trees`, `chunk_extractor`,
# `dep_extractor`, `info2iob`, `accuracy` and `IOBTagger` come from elsewhere
# in the file; the commented lines below show how the trees were produced.
sentences = []
evaluation_sents = []
for gold_sent in gold:
    sentences.append([w for w, t, c, l in gold_sent])

#tokens = tagger.tag_sents(sentences)
#chunk_trees = list(chunker.parse_sents(tokens))
#dep_trees = parser.parse_sents(sentences)

dep_tagged_sents = []
chunk_tagged_sents = []
for number, gold_sent in enumerate(gold):
    sentence = ' '.join(sentences[number])
    chunk_tree = chunk_trees[number]
    dep_tree = dep_trees[number]
    chunk_informations = list(chunk_extractor.extract(chunk_tree))
    dep_informations = list(dep_extractor.extract(dep_tree))

    evaluation_sent = [(w, l) for w, t, c, l in gold_sent]
    # info2iob yields (word, tag, chunk, label) tuples; keep (word, label).
    # The original wrapped the call in a redundant `[tokens for tokens in X]`
    # copy — dropped here, iteration is identical.
    dep_tagged_sent = [(w, l) for w, t, c, l
                       in info2iob(sentence, chunk_tree, dep_informations)]
    chunk_tagged_sent = [(w, l) for w, t, c, l
                         in info2iob(sentence, chunk_tree, chunk_informations)]

    if len(evaluation_sent) == len(dep_tagged_sent):
        evaluation_sents.append(evaluation_sent)
        dep_tagged_sents.append(dep_tagged_sent)
        chunk_tagged_sents.append(chunk_tagged_sent)
    else:
        # Token counts diverged — dump the mismatching sentence and skip it.
        print(chunk_tagged_sent)
        print()

# Flatten with nested comprehensions instead of the quadratic sum(sents, []).
flat_gold = [pair for sent in evaluation_sents for pair in sent]
flat_dep = [pair for sent in dep_tagged_sents for pair in sent]
flat_chunk = [pair for sent in chunk_tagged_sents for pair in sent]
print('dependency accuracy: %f' % (accuracy(flat_gold, flat_dep)))
print('chunk accuracy: %f' % (accuracy(flat_gold, flat_chunk)))

information_tagger = IOBTagger(model='informations-all.model')
print(information_tagger.evaluate(gold))
for c in range(i, i + len(arg.split())): arg_list.append(c) break info_list.append(arg_list) return info_list input = codecs.open('200DadeganSents.txt', 'r', encoding='utf8') dadegan = DadeganReader('Resources/Dadegan/train.conll') dadegan_trees = dadegan.trees() informations = [] sentences = [] for sentence in dadegan.sents(): sentences.append(' '.join([w for w, t in sentence])) for tree, chunks, sent in zip(dadegan_trees, dadegan.chunked_trees(), sentences): info_list = ([], [], []) for information in dependencyExtractor.extract(tree): temp_list = positions(information, sent) for i in range(3): if len(temp_list[i]) > 0 and temp_list[i] not in info_list[i]: info_list[i].append(temp_list[i]) if [] in info_list: continue else: tag_sent(chunks, info_list) """ for line in input.readlines(): if len(re.findall(r'^\d+-', line)) == 0: if len(line) > 1: informations.append(line.replace('\n', '').split(' + ')) else:
sentences = [] evaluation_sents = [] for gold_sent in gold: sentences.append([w for w, t, c, l in gold_sent]) #tokens = tagger.tag_sents(sentences) #chunk_trees = list(chunker.parse_sents(tokens)) #dep_trees = parser.parse_sents(sentences) dep_tagged_sents = [] chunk_tagged_sents = [] for number, gold_sent in enumerate(gold): sentence = ' '.join(sentences[number]) chunk_tree = chunk_trees[number] dep_tree = dep_trees[number] chunk_informations = list(chunk_extractor.extract(chunk_tree)) dep_informations = list(dep_extractor.extract(dep_tree)) evaluation_sent = [(w, l) for w, t, c, l in gold_sent] dep_tagged_sent = [(w, l) for w, t, c, l in [ tokens for tokens in info2iob(sentence, chunk_tree, dep_informations) ]] chunk_tagged_sent = [(w, l) for w, t, c, l in [ tokens for tokens in info2iob(sentence, chunk_tree, chunk_informations) ]] if len(evaluation_sent) == len(dep_tagged_sent): evaluation_sents.append(evaluation_sent) dep_tagged_sents.append(dep_tagged_sent) chunk_tagged_sents.append(chunk_tagged_sent) else: print(chunk_tagged_sent) print() print('dependency accuracy: %f' %
# Batch extraction over Hamshahri + Persica, appending results to
# informations.txt and logging failures to errors.txt.
# NOTE(review): `hamshahri`, `persica`, `uninformatives`, `codecs`, `chain`,
# the hazm classes and the tokenizers are defined elsewhere in the file.
normalizer = Normalizer()
tagger = POSTagger(model='resources/postagger.model')
# parser = TurboParser(tagger=tagger, lemmatizer=Lemmatizer(), model_file='resources/turboparser.model')
parser = DependencyParser(tagger=tagger, lemmatizer=Lemmatizer())
extractor = DependencyTreeInformationExtractor()

informations = codecs.open('resources/informations.txt', 'a+', encoding='utf8')
# BUG FIX: mode 'a+' opens positioned at EOF, so the original always read an
# empty processed_sentences set.  Rewind first; appends still go to EOF.
informations.seek(0)
processed_sentences = {line.strip()[2:]  # strip the leading '# ' marker
                       for line in informations if line.startswith('#')}
errors = codecs.open('resources/errors.txt', 'w', encoding='utf8')

for text in chain(hamshahri.texts(), persica.texts()):
    # Pre-bind so the except handler cannot hit an unbound/stale `sentences`
    # when normalization or tokenization is what raised.
    sentences = []
    try:
        sentences = [
            sentence
            for sentence in sent_tokenize(normalizer.normalize(text))
            if len(sentence) > 15 and sentence not in processed_sentences
        ]
        if not sentences:
            continue
        parsed = parser.parse_sents(map(word_tokenize, sentences))
        for sentence, tree in zip(sentences, parsed):
            print('#', sentence, file=informations)
            for information in extractor.extract(tree):
                if information[1] not in uninformatives:
                    print(*information, sep=' - ', file=informations)
            print(file=informations)
    except Exception as error:  # best-effort batch job: log and continue
        print(error, 'while processing:', *sentences, sep='\n', file=errors)
import codecs

from hazm import DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor

# Extract informations from the Dadegan treebank with both extractors and
# write them to one file: chunk-based triples separated by ' - ', then
# dependency-based triples separated by ' + ', one blank line after each
# group so sentence boundaries stay visible.
dadegan = DadeganReader('corpora/train.conll')
chunk_extractor = ChunkTreeInformationExtractor()
dependency_extractor = DependencyTreeInformationExtractor()

# Context manager closes/flushes the output file (the original leaked it).
with codecs.open('resources/informations.txt', 'w', encoding='utf8') as output:
    for chunk_tree, dependency_tree in zip(dadegan.chunked_trees(),
                                           dadegan.trees()):
        for information in chunk_extractor.extract(chunk_tree):
            print(*information, sep=' - ', file=output)
        print(file=output)
        for information in dependency_extractor.extract(dependency_tree):
            print(*information, sep=' + ', file=output)
        print(file=output)