def __init__(self):
    """Load the uninformative-adverbs list and build the hazm NLP pipeline.

    Fix over the original: the data file is opened inside a ``with`` block
    so the handle is closed deterministically (the original leaked it), and
    the quadratic ``set(sum([...], []))`` flatten is replaced by a set
    comprehension.
    """
    # Each data line looks like "a - b - c"; blank lines and lines starting
    # with "#" are comments.  Split every remaining line on " - " and
    # collect all pieces into one set.
    with open('data/adverbs.dat', encoding='utf8') as adverbs_file:
        self.uninformatives = {
            word
            for line in adverbs_file.read().split('\n')
            if line.strip() and not line.startswith('#')
            for word in line.split(' - ')
        }
    self.normalizer = Normalizer()
    self.tagger = POSTagger(model='resources/postagger.model')
    self.parser = DependencyParser(tagger=self.tagger, lemmatizer=Lemmatizer())
    self.extractor = DependencyTreeInformationExtractor()
class InformationExtractor:
    """Extract information tuples from raw Persian text.

    Pipeline: normalize -> sentence-tokenize -> POS-tag -> dependency-parse,
    then run a dependency-tree information extractor over each parse and
    drop extractions whose relation part is an "uninformative" adverb.
    """

    def __init__(self):
        """Load the uninformative-adverbs list and build the NLP pipeline.

        Fix: the data file is opened with a context manager (the original
        leaked the handle), and the quadratic ``set(sum([...], []))``
        flatten is replaced by a set comprehension.
        """
        # "data/adverbs.dat" lines look like "a - b - c"; blank lines and
        # "#" comment lines are skipped.
        with open('data/adverbs.dat', encoding='utf8') as adverbs_file:
            self.uninformatives = {
                word
                for line in adverbs_file.read().split('\n')
                if line.strip() and not line.startswith('#')
                for word in line.split(' - ')
            }
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.parser = DependencyParser(tagger=self.tagger, lemmatizer=Lemmatizer())
        self.extractor = DependencyTreeInformationExtractor()

    def analyze(self, text):
        """Yield information tuples extracted from *text*.

        Sentences of 15 characters or fewer are ignored.  Extractions whose
        second element (the relation) is an uninformative adverb are
        filtered out.  This is a generator, so a bare ``return`` simply
        ends iteration (the original's ``return []`` value was discarded
        by generator semantics anyway).
        """
        sentences = [
            sentence
            for sentence in sent_tokenize(self.normalizer.normalize(text))
            if len(sentence) > 15
        ]
        if not sentences:
            return
        # parse_sents yields one dependency tree per input sentence; the
        # original zipped the trees back with `sentences` but never used
        # the sentence, so we iterate the trees directly.
        for tree in self.parser.parse_sents(map(word_tokenize, sentences)):
            for information in self.extractor.extract(tree):
                if information[1] not in self.uninformatives:
                    yield information
class InformationExtractor:
    """Extract information tuples from raw Persian text (hazm pipeline).

    Normalizes the text, splits it into sentences, dependency-parses each
    sentence, and yields extractions that are not filtered as
    uninformative.
    """

    def __init__(self):
        """Build the NLP pipeline and load the uninformative-adverbs set.

        Fix: open the data file with a context manager so the handle is
        closed (the original leaked it); build the set with a comprehension
        instead of the quadratic ``set(sum([...], []))`` flatten.
        """
        with open('data/adverbs.dat', encoding='utf8') as adverbs_file:
            # Lines look like "a - b - c"; skip blanks and "#" comments.
            self.uninformatives = {
                part
                for line in adverbs_file.read().split('\n')
                if line.strip() and not line.startswith('#')
                for part in line.split(' - ')
            }
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.parser = DependencyParser(tagger=self.tagger, lemmatizer=Lemmatizer())
        self.extractor = DependencyTreeInformationExtractor()

    def analyze(self, text):
        """Yield information tuples extracted from *text*.

        Sentences of 15 characters or fewer are skipped; extractions whose
        relation (second element) is an uninformative adverb are dropped.
        Generator: a bare ``return`` ends iteration, equivalent in effect
        to the original's ``return []``.
        """
        sentences = [
            sentence
            for sentence in sent_tokenize(self.normalizer.normalize(text))
            if len(sentence) > 15
        ]
        if not sentences:
            return
        # One parsed tree per sentence; the paired sentence itself was
        # never used in the original loop, so iterate the trees alone.
        for tree in self.parser.parse_sents(map(word_tokenize, sentences)):
            for information in self.extractor.extract(tree):
                if information[1] not in self.uninformatives:
                    yield information
from __future__ import print_function import codecs from itertools import chain from hazm import * from baaz import DependencyTreeInformationExtractor hamshahri = HamshahriReader(root='corpora/hamshahri') persica = PersicaReader(csv_file='corpora/persica.csv') uninformatives = set(sum([line.split(' - ') for line in codecs.open('data/adverbs.dat', encoding='utf8').read().split('\n') if line.strip() and not line.startswith('#')], [])) normalizer = Normalizer() tagger = POSTagger(model='resources/postagger.model') parser = TurboParser(tagger=tagger, lemmatizer=Lemmatizer(), model_file='resources/turboparser.model') extractor = DependencyTreeInformationExtractor() informations = codecs.open('resources/informations.txt', 'a+', encoding='utf8') processed_sentences = set([line.strip()[2:] for line in informations if line.startswith('#')]) for text in chain(hamshahri.texts(), persica.texts()): try: sentences = [sentence for sentence in sent_tokenize(normalizer.normalize(text)) if len(sentence) > 15 and sentence not in processed_sentences] if not sentences: continue for sentence, tree in zip(sentences, parser.parse_sents(map(word_tokenize, sentences))): print('#', sentence, file=informations)
def __init__(self):
    """Load the uninformative-adverbs set and build the hazm NLP pipeline.

    Fix over the original: ``open`` is wrapped in a ``with`` block so the
    file handle is closed (the original leaked it), and the quadratic
    ``set(sum([...], []))`` flatten is replaced by a set comprehension.
    """
    with open('data/adverbs.dat', encoding='utf8') as adverbs_file:
        # Data lines look like "a - b - c"; blank lines and "#" comment
        # lines are skipped, every " - "-separated piece goes in the set.
        self.uninformatives = {
            word
            for line in adverbs_file.read().split('\n')
            if line.strip() and not line.startswith('#')
            for word in line.split(' - ')
        }
    self.normalizer = Normalizer()
    self.tagger = POSTagger(model='resources/postagger.model')
    self.parser = DependencyParser(tagger=self.tagger, lemmatizer=Lemmatizer())
    self.extractor = DependencyTreeInformationExtractor()
from baaz import ChunkTreeInformationExtractor, DependencyTreeInformationExtractor from hazm import Chunker, Normalizer, SequencePOSTagger, IOBTagger, DependencyParser, Lemmatizer import codecs, re from nltk import accuracy from nltk.tree import Tree #output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8') dadegan = DadeganReader('resources/Dadegan/test.conll') tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model') lemmatizer = Lemmatizer() chunker = Chunker(model='Resources/chunker-dadeganFull.model') parser = DependencyParser(tagger, lemmatizer=lemmatizer ) normalizer = Normalizer() chunk_extractor = ChunkTreeInformationExtractor() dep_extractor = DependencyTreeInformationExtractor() trees = list(dadegan.chunked_trees()) chunk_trees = trees[:100] + trees[200:300] trees = list(dadegan.trees()) dep_trees = trees[:100] + trees[200:300] #dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8') #sentences = [] #for sent in dadegan.sents(): # sentences.append(' '.join([w for w, t in sent])) indices = [] def tag_sent(chunks, args): tagged_sent = [] global_index = 0 for chunk in chunks: if type(chunk) is Tree:
#coding=utf8 from hazm import sent_tokenize, word_tokenize, Normalizer, HamshahriReader, SequencePOSTagger, DependencyParser, Chunker, Lemmatizer, DadeganReader from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor from progress.bar import Bar from nltk import Tree from itertools import combinations import codecs, re arg = lambda chunk: ' '.join([word for word, tag in chunk.leaves()]) #hamshahri = HamshahriReader('Resources/Hamshahri/') #normalizer = Normalizer() #tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model') #parser = DependencyParser(tagger=tagger, lemmatizer=Lemmatizer()) #chunker = Chunker(tagger, model='Resources/chunker-dadeganFull.model') dependencyExtractor = DependencyTreeInformationExtractor() chunkExtractor = ChunkTreeInformationExtractor() texts = [] output = codecs.open('trainingSet_dadegan.txt', 'w', encoding='utf8') def extractCandidates(chunk_tree): candidates = [] chunks_list = list(chunk_tree) for chunk in chunk_tree: if type(chunk) is not Tree and chunk[1] == "PUNC": chunks_list.remove(chunk) for c in range(len(chunks_list)): chunk = chunks_list[c] if c > 0: previuos = chunks_list[c - 1]
from hazm import DadeganReader from baaz import ChunkTreeInformationExtractor, DependencyTreeInformationExtractor from hazm import Chunker, Normalizer, SequencePOSTagger, IOBTagger, DependencyParser, Lemmatizer import codecs, re from nltk import accuracy from nltk.tree import Tree #output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8') dadegan = DadeganReader('resources/Dadegan/test.conll') tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model') lemmatizer = Lemmatizer() chunker = Chunker(model='Resources/chunker-dadeganFull.model') parser = DependencyParser(tagger, lemmatizer=lemmatizer) normalizer = Normalizer() chunk_extractor = ChunkTreeInformationExtractor() dep_extractor = DependencyTreeInformationExtractor() trees = list(dadegan.chunked_trees()) chunk_trees = trees[:100] + trees[200:300] trees = list(dadegan.trees()) dep_trees = trees[:100] + trees[200:300] #dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8') #sentences = [] #for sent in dadegan.sents(): # sentences.append(' '.join([w for w, t in sent])) indices = [] def tag_sent(chunks, args): tagged_sent = [] global_index = 0 for chunk in chunks:
import codecs

from hazm import DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor

# Dump gold-standard extractions from the Dadegan training corpus.
# For every sentence, chunk-based extractions are printed with " - "
# separators and dependency-based ones with " + ", each group followed
# by a blank line.
dadegan = DadeganReader('corpora/train.conll')
chunk_extractor = ChunkTreeInformationExtractor()
dependency_extractor = DependencyTreeInformationExtractor()

# Fix: the original never closed `output`; a context manager guarantees
# the file is flushed and closed even if extraction raises midway.
with codecs.open('resources/informations.txt', 'w', encoding='utf8') as output:
    for chunk_tree, dependency_tree in zip(dadegan.chunked_trees(), dadegan.trees()):
        for information in chunk_extractor.extract(chunk_tree):
            print(*information, sep=' - ', file=output)
        print(file=output)
        for information in dependency_extractor.extract(dependency_tree):
            print(*information, sep=' + ', file=output)
        print(file=output)