Code example #1
File: InformationExtractor.py Project: sobhe/baaz
# Assumed imports to make the snippet self-contained; the other examples on
# this page pull the same names from hazm and baaz.
from hazm import (Normalizer, POSTagger, DependencyParser, Lemmatizer,
                  sent_tokenize, word_tokenize)
from baaz import DependencyTreeInformationExtractor


class InformationExtractor:
    def __init__(self):
        # Load the uninformative-adverbs list: each non-comment line of
        # adverbs.dat holds one or more entries separated by ' - '.
        self.uninformatives = set(
            sum([
                line.split(' - ') for line in open(
                    'data/adverbs.dat', encoding='utf8').read().split('\n')
                if line.strip() and not line.startswith('#')
            ], []))
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.parser = DependencyParser(tagger=self.tagger,
                                       lemmatizer=Lemmatizer())
        self.extractor = DependencyTreeInformationExtractor()

    def analyze(self, text):
        # Normalize the text, split it into sentences, and drop very short
        # ones (15 characters or fewer).
        sentences = [
            sentence
            for sentence in sent_tokenize(self.normalizer.normalize(text))
            if len(sentence) > 15
        ]
        if not sentences:
            return

        # Parse all sentences in one batch, then yield only the extracted
        # tuples whose second element is not an uninformative adverb.
        for sentence, tree in zip(
                sentences,
                self.parser.parse_sents(map(word_tokenize, sentences))):
            for information in self.extractor.extract(tree):
                if information[1] not in self.uninformatives:
                    yield information
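A minimal usage sketch for the class above, assuming the model files under resources/ and data/adverbs.dat are in place (some_persian_text is a placeholder, not part of the project). Since analyze is a generator, extractions stream lazily:

extractor = InformationExtractor()
# some_persian_text: any hypothetical input string of Persian text
for information in extractor.analyze(some_persian_text):
    print(*information, sep=' - ')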
Code example #2
File: extract.py Project: RaminNietzsche/baaz
from __future__ import print_function
import codecs
from itertools import chain
from hazm import *
from baaz import DependencyTreeInformationExtractor


hamshahri = HamshahriReader(root='corpora/hamshahri')
persica = PersicaReader(csv_file='corpora/persica.csv')
# Same uninformative-adverbs list as in InformationExtractor above.
uninformatives = set(sum([line.split(' - ') for line in codecs.open('data/adverbs.dat', encoding='utf8').read().split('\n') if line.strip() and not line.startswith('#')], []))


normalizer = Normalizer()
tagger = POSTagger(model='resources/postagger.model')
parser = TurboParser(tagger=tagger, lemmatizer=Lemmatizer(), model_file='resources/turboparser.model')
extractor = DependencyTreeInformationExtractor()


informations = codecs.open('resources/informations.txt', 'a+', encoding='utf8')
# 'a+' positions the stream at end-of-file, so rewind before reading back
# the already-processed sentences (checkpoint lines starting with '#').
informations.seek(0)
processed_sentences = set([line.strip()[2:] for line in informations if line.startswith('#')])


for text in chain(hamshahri.texts(), persica.texts()):
	try:
		sentences = [sentence for sentence in sent_tokenize(normalizer.normalize(text)) if len(sentence) > 15 and sentence not in processed_sentences]
		if not sentences:
			continue

		for sentence, tree in zip(sentences, parser.parse_sents(map(word_tokenize, sentences))):
			# Write a '#'-prefixed checkpoint line so this sentence is skipped
			# on the next run (see processed_sentences above).
			print('#', sentence, file=informations)
			# Assumed continuation (the source snippet is cut off here):
			# filter and write the extractions, as InformationExtractor.analyze does.
			for information in extractor.extract(tree):
				if information[1] not in uninformatives:
					print(*information, sep=' - ', file=informations)
	except Exception:
		# Assumed handler; the except clause is missing from the truncated
		# snippet. Skipping unparseable texts keeps the batch run going.
		continue
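A note on the design: the '#'-prefixed sentence lines double as a checkpoint log. Because informations.txt is opened in 'a+' mode and re-read at startup, the script can be interrupted and restarted over the large Hamshahri and Persica corpora without re-parsing sentences it has already recorded.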
Code example #3
#coding=utf8
from hazm import sent_tokenize, word_tokenize, Normalizer, HamshahriReader, SequencePOSTagger, DependencyParser, Chunker, Lemmatizer, DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor
from progress.bar import Bar
from nltk import Tree
from itertools import combinations
import codecs, re

# Helper: rebuild a chunk's surface string from its (word, tag) leaves.
arg = lambda chunk: ' '.join([word for word, tag in chunk.leaves()])
#hamshahri = HamshahriReader('Resources/Hamshahri/')
#normalizer = Normalizer()
#tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
#parser = DependencyParser(tagger=tagger, lemmatizer=Lemmatizer())
#chunker = Chunker(tagger, model='Resources/chunker-dadeganFull.model')
dependencyExtractor = DependencyTreeInformationExtractor()
chunkExtractor = ChunkTreeInformationExtractor()
texts = []

output = codecs.open('trainingSet_dadegan.txt', 'w', encoding='utf8')

def extractCandidates(chunk_tree):
	candidates = []
	# Drop punctuation tokens before pairing neighbouring chunks.
	chunks_list = list(chunk_tree)
	for chunk in chunk_tree:
		if type(chunk) is not Tree and chunk[1] == "PUNC":
			chunks_list.remove(chunk)

	for c in range(len(chunks_list)):
		chunk = chunks_list[c]
		if c > 0:
			previous = chunks_list[c - 1]
Code example #4
File: 200DadeganSents.py Project: ecnumjc/baaz
from hazm import DadeganReader
from baaz import ChunkTreeInformationExtractor, DependencyTreeInformationExtractor
from hazm import Chunker, Normalizer, SequencePOSTagger, IOBTagger, DependencyParser, Lemmatizer
import codecs, re
from nltk import accuracy
from nltk.tree import Tree

#output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8')
dadegan = DadeganReader('resources/Dadegan/test.conll')
tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model')
lemmatizer = Lemmatizer()
chunker = Chunker(model='Resources/chunker-dadeganFull.model')
parser = DependencyParser(tagger, lemmatizer=lemmatizer)
normalizer = Normalizer()
chunk_extractor = ChunkTreeInformationExtractor()
dep_extractor = DependencyTreeInformationExtractor()
# Evaluate both extractors on the same 200 sentences (1-100 and 201-300).
trees = list(dadegan.chunked_trees())
chunk_trees = trees[:100] + trees[200:300]
trees = list(dadegan.trees())
dep_trees = trees[:100] + trees[200:300]
#dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8')
#sentences = []
#for sent in dadegan.sents():
#	sentences.append(' '.join([w for w, t in sent]))
indices = []


def tag_sent(chunks, args):
    tagged_sent = []
    global_index = 0
    for chunk in chunks:
        if type(chunk) is Tree:
Code example #5
File: dadegan.py Project: RaminNietzsche/baaz
import codecs
from hazm import DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor


output = codecs.open('resources/informations.txt', 'w', encoding='utf8')
dadegan = DadeganReader('corpora/train.conll')
chunk_extractor = ChunkTreeInformationExtractor()
dependency_extractor = DependencyTreeInformationExtractor()

# For each sentence, write the chunk-based extractions (fields joined by
# ' - ') and the dependency-based extractions (fields joined by ' + '),
# each block followed by a blank line.
for chunk_tree, dependency_tree in zip(dadegan.chunked_trees(), dadegan.trees()):
	for information in chunk_extractor.extract(chunk_tree):
		print(*information, sep=' - ', file=output)
	print(file=output)
	for information in dependency_extractor.extract(dependency_tree):
		print(*information, sep=' + ', file=output)
	print(file=output)
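Given that output format, a minimal reader sketch (an assumption for illustration, not part of baaz; the separators and blank-line record boundaries follow dadegan.py above):

import codecs

# Hypothetical helper: split informations.txt back into chunk-based and
# dependency-based extraction tuples.
def read_informations(path='resources/informations.txt'):
	chunk_infos, dep_infos = [], []
	for line in codecs.open(path, encoding='utf8'):
		line = line.strip()
		if not line:
			continue
		if ' + ' in line:
			dep_infos.append(tuple(line.split(' + ')))
		else:
			chunk_infos.append(tuple(line.split(' - ')))
	return chunk_infos, dep_infos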