Example #1
 def __init__(self, inFile, outFile):
     self.inFile = inFile
     self.outFile = outFile
     self.normalizer = Normalizer()
     self.tagger = POSTagger(model='resources/postagger.model')
     self.lemmatizer = Lemmatizer()
     self.stemmer = Stemmer()
Example #2
    def __init__(self, component_config: Dict[Text, Any] = None) -> None:

        super().__init__(component_config)
        if self.component_config.get('stemmer'):
            self._stemmer = Stemmer()

        if self.component_config.get('lemmatizer'):
            self._lemmatizer = Lemmatizer()

        if self.component_config.get('pos'):
            self._pos_tagger = POSTagger(model='resources/postagger.model')
	def nouns(self, texts):
		total_count = len(texts)
		tagger = POSTagger()
		nouns = []
		tagged_doc = tagger.tag_sents(texts)
		for sent in tagged_doc:
			sentence = []
			for word, tag in sent:
				if tag == 'N':
					sentence.append(word)
			nouns.append(sentence)

		return nouns
def hazmtoalpheios(word,uri):
    wordslist = etree.Element("words")
    normalizer = Normalizer()
    data = normalizer.normalize(word)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        # accumulate tokens from every sentence (list.append returns None)
        words.extend(word_tokenize(sentence))
    analyses = []
    # build the analyzers once rather than once per token
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    for item in words:
        wordstem = stemmer.stem(item)
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, _ = wordlema.split("#")
        wordtagged = tagger.tag([item])  # POSTagger.tag expects a list of tokens
        wordpofs = wordtagged[0][1]
        wordpofs = maptohazm(wordpofs)
        # a better way to do this would be to create a Python class
        # to formalize the abstraction (a sketch follows this function)
        analysis = {}
        analysis['engine'] = 'hazm'
        analysis['uri'] = uri
        analysis['form'] = {}
        analysis['form']['text'] = item
        analysis['form']['lang'] = 'per'
        analysis['entries'] = []
        entry = {}
        entry['dict'] = {}
        entry['dict']['hdwd'] = {}
        entry['dict']['hdwd']['lang'] = 'per'
        entry['dict']['hdwd']['text'] = wordstem
        entry['infls'] = []
        infl = {}
        infl['stem'] = {} 
        infl['stem']['text'] = wordstem
        infl['stem']['lang'] = 'per'
        infl['pofs'] = {}
        if wordpofs:
            infl['pofs']['order'] = str(wordpofs[1])
            infl['pofs']['text'] = wordpofs[0]
        entry['infls'].append(infl)
        analysis['entries'].append(entry)
        analyses.append(analysis)
    return analyses
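
A minimal sketch of the class-based abstraction the comment above suggests; the class and method names are illustrative only and not part of the original code:

# Hypothetical container mirroring the dictionaries built in hazmtoalpheios.
class HazmAnalysis:
    def __init__(self, text, uri, lang='per', engine='hazm'):
        self.engine = engine
        self.uri = uri
        self.form = {'text': text, 'lang': lang}
        self.entries = []

    def add_entry(self, stem, pofs=None):
        # pofs is the (text, order) pair produced by maptohazm, or None
        infl = {'stem': {'text': stem, 'lang': self.form['lang']}, 'pofs': {}}
        if pofs:
            infl['pofs'] = {'order': str(pofs[1]), 'text': pofs[0]}
        self.entries.append({
            'dict': {'hdwd': {'lang': self.form['lang'], 'text': stem}},
            'infls': [infl],
        })

    def to_dict(self):
        return {'engine': self.engine, 'uri': self.uri,
                'form': self.form, 'entries': self.entries}
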
Example #5
def train_pos_tagger(bijankhan_file='resources/bijankhan.txt', path_to_model='resources/persian.tagger', path_to_jar='resources/stanford-postagger.jar', properties_file='resources/persian.tagger.props', memory_min='-Xms1g', memory_max='-Xmx2g', test_split=.1):
	bijankhan = BijankhanReader(bijankhan_file)
	train_file = 'resources/tagger_train_data.txt'
	output = codecs.open(train_file, 'w', 'utf8')
	sentences = list(bijankhan.sents())
	train_part = int(len(sentences) * (1 - test_split))

	for sentence in sentences[:train_part]:
		print(*(map(lambda w: '/'.join(w).replace(' ', '_'), sentence)), file=output)
	cmd = ['java', memory_min, memory_max, '-classpath', path_to_jar, 'edu.stanford.nlp.tagger.maxent.MaxentTagger', '-prop', properties_file, '-model', path_to_model,  '-trainFile', train_file, '-tagSeparator', '/', '-search', 'owlqn2']
	process = subprocess.Popen(cmd)
	process.wait()

	tagger = POSTagger()
	print('\n\n', 'Tagger Accuracy on Test Split:', tagger.evaluate(sentences[train_part:]))
 def __init__(self, code, config, **kwargs):
     """ Constructor
     :param code: code
     :type code: str
     :param config: app config
     :type config: dict
     """
     super(HazmEngine, self).__init__(code, config, **kwargs)
     self.code = code
     self.config = config
     self.oa_transformer = OaLegacyTransformer()
     self.language_codes = ['per', 'fas']
     self.uri = self.config['PARSERS_HAZM_URI']
     self.tagger = POSTagger(model=os.path.join(os.path.dirname(__file__),
                                                'hazm', "postagger.model"))
def worker(identifier, skip, count):
    tagger = POSTagger()
    done = 0
    start = time.time()
    stopwords = load_stopwords()
    documents_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.HAMSHAHRI_DATABASE][Settings.HAMSHAHRI_COLLECTION]
    tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.TAGS_DATABASE][Settings.HAMSHAHRI_COLLECTION]

    batch_size = 50
    for batch in range(0, count, batch_size):
        hamshahri_cursor = documents_collection.find().skip(
            skip + batch).limit(batch_size)
        for doc in hamshahri_cursor:
            words = []
            sentences = sent_tokenize(doc['text'])
            sents = []
            for sentence in sentences:
                tokens = word_tokenize(sentence)
                text = [word for word in tokens if word not in stopwords]
                sents.append(text)

            tags = tagger.tag_sents(sents)
            for sent in tags:
                for word, tag in sent:
                    words.append({'word': word, "pos": tag})

            tags_collection.insert({
                "id": doc["id"],
                "categories_fa": doc["categories_fa"],
                "text": doc["text"],
                "words": words
            })

            done += 1
            #if done % 100 == 0:
            end = time.time()
            print('Worker {}: Done {} out of {} in {:.2f} sec ~ {:.2f}/sec'.format(
                identifier, done, count, end - start, done / (end - start)))
            sys.stdout.flush()
Example #8
def hazmtoalpheios(word,uri):
    wordslist = etree.Element("words")
    normalizer = Normalizer()
    item = normalizer.normalize(word)
    analyses = []
    stemmer = Stemmer()
    wordstem = stemmer.stem(item)
    lemmatizer = Lemmatizer()
    wordlema = lemmatizer.lemmatize(item)
    if '#' in wordlema:
        wordlema, _ = wordlema.split("#")
    tagger = POSTagger(model=os.path.join(model_path,"postagger.model"))
    wordtagged = tagger.tag(word_tokenize(item))
    wordpofs = wordtagged[0][1]
    wordpofs = maptohazm(wordpofs)
    # a better way to do this would be to create a Python class
    # to formalize the abstraction
    analysis = {}
    analysis['engine'] = 'hazm'
    analysis['uri'] = uri
    analysis['form'] = {}
    analysis['form']['text'] = item
    analysis['form']['lang'] = 'per'
    analysis['entries'] = []
    entry = {}
    entry['dict'] = {}
    entry['dict']['hdwd'] = {}
    entry['dict']['hdwd']['lang'] = 'per'
    entry['dict']['hdwd']['text'] = wordstem
    entry['infls'] = []
    infl = {}
    infl['stem'] = {} 
    infl['stem']['text'] = wordstem
    infl['stem']['lang'] = 'per'
    infl['pofs'] = {}
    if wordpofs:
        infl['pofs']['order'] = str(wordpofs[1])
        infl['pofs']['text'] = wordpofs[0]
    entry['infls'].append(infl)
    analysis['entries'].append(entry)
    analyses.append(analysis)
    return analyses
Example #9
def hazmtoalpheiosfile(data,uri):
    root = etree.Element("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF")    
    oaannotation = etree.SubElement(root,'{http://www.w3.org/ns/oa#}Annotation',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':'http://services.projectbamboo.org/morphology'+uri})
    oahasbody = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasBody',)
    oahastarget = etree.SubElement(oaannotation,'{http://www.w3.org/ns/oa#}hasTarget')
    hasbodydesc = etree.SubElement(oahastarget,'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':uri})
    ispartof = etree.SubElement(hasbodydesc,'{http://purl.org/dc/terms/}isPartOf',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':uri})
    source = etree.SubElement(hasbodydesc,'{http://purl.org/dc/terms/}source',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource':uri})
    title = etree.SubElement(oaannotation, '{http://purl.org/dc/elements/1.1/}title', {'{http://www.w3.org/XML/1998/namespace}lang':'eng'})
    title.text = "Morphology of " + uri
    wordslist = etree.SubElement(root, "words")  # SubElement needs a parent element
    normalizer = Normalizer()
    data = normalizer.normalize(data)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        # accumulate tokens from every sentence (list.append returns None)
        words.extend(word_tokenize(sentence))
    # build the analyzers once rather than once per token
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    for item in words:
        wordstem = stemmer.stem(item)
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, _ = wordlema.split("#")
        wordtagged = tagger.tag([item])  # POSTagger.tag expects a list of tokens
        wordpofs = wordtagged[0][1]
        word = etree.SubElement(wordslist,'word')
        form = etree.SubElement(word, 'form', {'{http://www.w3.org/XML/1998/namespace}lang':'per'})
        form.text = item
        entry = etree.SubElement(word, 'entry')
        infl = etree.SubElement(entry, 'infl')
        term = etree.SubElement(infl, 'term', {'{http://www.w3.org/XML/1998/namespace}lang':'per'})
        stem = etree.SubElement(term, 'stem')
        stem.text = wordstem
        pofs = etree.SubElement(infl, 'pofs')
        pofs.text = wordpofs
    return root
Example #11
    def __init__(self,
                 corpus_path='resources/corpus.json',
                 symbols_json_path='resources/symbols.json',
                 persian_lang_path='resources/persian_lang.json',
                 postagger_model_path='resources/postagger.model',
                 max_keyword_num=10, min_keyword_occurrences=0.01, expand_corpus=False):
        self.postagger_model_path = postagger_model_path
        self.symbols_json_path = symbols_json_path
        self.corpus_path = corpus_path
        self.corpus = {}
        self.docs_num = 0
        self.expand_corpus = expand_corpus

        if self.corpus_path is not None:
            with open(corpus_path, encoding='utf-8') as json_file:
                corpus = json.load(json_file)
            self.corpus = corpus['corpus']
            self.docs_num = corpus['docs_num']

        with open(symbols_json_path, encoding='utf-8') as json_file:
            data = json.load(json_file)
        lst = list(data.values())
        self.all_symbols_list = [item for sublist in lst for item in sublist]

        with open(persian_lang_path, encoding='utf-8') as json_file:
            persian_lang = json.load(json_file)

        self.epic_keywords = persian_lang['epic_keywords']
        self.punctuations = persian_lang['punctuations']
        self.persian_alphabet = persian_lang['persian_alphabet']
        self.stop_words = persian_lang['stop_words']

        self.tagger = POSTagger(model=self.postagger_model_path)
        self.normalizer = Normalizer()
        self.max_keyword_num = max_keyword_num
        self.min_keyword_occurrences = min_keyword_occurrences
 def __init__(self, question, useStemmer = False, useSynonyms = False, removeStopwords = False):
     self.question = question
     self.useStemmer = useStemmer
     self.useSynonyms = useSynonyms
     self.removeStopwords = removeStopwords
     self.stopWords = stopwords.words("english")
     self.stem = lambda k : k.lower()
     if self.useStemmer:
         ps = PorterStemmer()
         self.stem = ps.stem
     self.qType = self.determineQuestionType(question)
     self.searchQuery = self.buildSearchQuery(question)
     self.qVector = self.getQueryVector(self.searchQuery)
     self.aType = self.determineAnswerType(question)
     post = POSTagger()
Example #13
def train_dependency_parser(train_file='resources/train.conll', test_file='resources/test.conll', model_file='langModel.mco', path_to_jar='resources/malt.jar', options_file='resources/options.xml', features_file='resources/features.xml', memory_min='-Xms7g', memory_max='-Xmx8g'):

	def read_conll(conll_file):
		trees = [DependencyGraph(item) for item in dadegan_text(conll_file).replace(' ', '_').split('\n\n') if item.strip()]
		sentences = [[node['word'] for node in tree.nodelist[1:]] for tree in trees]
		return trees, sentences

	lemmatizer, tagger = Lemmatizer(), POSTagger()

	trees, sentences = read_conll(train_file)
	tagged = tagger.batch_tag(sentences)

	train_data = train_file +'.data'
	with codecs.open(train_data, 'w', 'utf8') as output:
		for tree, sentence in zip(trees, tagged):
			for i, (node, word) in enumerate(zip(tree.nodelist[1:], sentence), start=1):
				node['tag'] = word[1]
				node['lemma'] = lemmatizer.lemmatize(node['word'].replace('_', ' '), node['tag'])
				print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'), node['tag'], node['tag'], '_', node['head'], node['rel'], '_', '_', sep='\t', file=output)
			print(file=output)

	cmd = ['java', memory_min, memory_max, '-jar', path_to_jar, '-w', 'resources', '-c', model_file, '-i', train_data, '-f', options_file, '-F', features_file, '-m', 'learn']
	process = subprocess.Popen(cmd)
	process.wait()

	# evaluation
	print('\nEvaluating trained model on test data:')
	parser = DependencyParser(tagger=tagger, model_file=model_file)

	trees, sentences = read_conll(test_file)
	tagged = tagger.batch_tag(sentences)
	parsed = parser.tagged_batch_parse(tagged)

	test_data, test_results = test_file +'.data', test_file +'.results'
	print('\n'.join([sentence.to_conll(10) for sentence in trees]).strip(), file=codecs.open(test_data, 'w', 'utf8'))
	print('\n'.join([sentence.to_conll(10) for sentence in parsed]).strip(), file=codecs.open(test_results, 'w', 'utf8'))

	cmd = ['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results]
	process = subprocess.Popen(cmd)
	process.wait()
Example #14
class HazmTokenizer(Component):
    defaults = {"stemmer": True, "lemmatizer": True, 'pos': False}

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:

        super().__init__(component_config)
        if self.component_config.get('stemmer'):
            self._stemmer = Stemmer()

        if self.component_config.get('lemmatizer'):
            self._lemmatizer = Lemmatizer()

        if self.component_config.get('pos'):
            self._pos_tagger = POSTagger(model='resources/postagger.model')

    def required_packages(self) -> List[Text]:
        return ['hazm']

    def process(self, message: Message, **kwargs: Any) -> None:
        text = message.text
        for sentence_str in sent_tokenize(text):
            sentence = Sentence(sentence_str)
            tokens = word_tokenize(sentence_str)
            pos_tags = []
            if self.component_config.get('pos'):
                pos_tags = self._pos_tagger.tag(tokens)
            for idx, token_str in enumerate(tokens):
                token = Token(text=token_str)
                if self.component_config.get('stemmer'):
                    token[TOKEN_ATTRIBUTE_STEM] = self._stemmer.stem(token_str)
                if self.component_config.get('lemmatizer'):
                    token[TOKEN_ATTRIBUTE_LEMM] = self._lemmatizer.lemmatize(
                        token_str)
                if self.component_config.get('pos'):
                    token[TOKEN_ATTRIBUTE_POS] = pos_tags[idx][1]
                sentence.add_token(token)
            message.add_sentence(sentence)
class HazmEngine(AlpheiosXmlEngine):
    def __init__(self, code, config, **kwargs):
        """ Constructor
        :param code: code
        :type code: str
        :param config: app config
        :type config: dict
        """
        super(HazmEngine, self).__init__(code, config, **kwargs)
        self.code = code
        self.config = config
        self.oa_transformer = OaLegacyTransformer()
        self.language_codes = ['per', 'fas']
        self.uri = self.config['PARSERS_HAZM_URI']
        self.tagger = POSTagger(model=os.path.join(os.path.dirname(__file__),
                                                   'hazm', "postagger.model"))

    def lookup(self,
               word=None,
               word_uri=None,
               language=None,
               request_args=None,
               **kwargs):
        """ Word Lookup Function
        :param word: the word to lookup
        :type word: str
        :param word_uri: a uri for the word
        :type word_uri: str
        :param language: the language code for the word
        :type language: str
        :param request_args: dict of engine specific request arguments
        :type request_args: dict
        :return: the analysis as an alpheios XML element (a usage sketch
            follows this class)
        :rtype: etree.Element
        """
        normalizer = Normalizer()
        item = normalizer.normalize(word)
        analyses = []
        stemmer = Stemmer()
        wordstem = stemmer.stem(item)
        wordtagged = self.tagger.tag(word_tokenize(item))
        wordpofs = wordtagged[0][1]
        wordpofs = self.maptohazm(wordpofs)
        analysis = {}
        analysis['entries'] = []
        entry = {}
        entry['dict'] = {}
        entry['dict']['hdwd'] = {}
        entry['dict']['hdwd']['lang'] = 'per'
        entry['dict']['hdwd']['text'] = wordstem
        entry['infls'] = []
        infl = {}
        infl['stem'] = {}
        infl['stem']['text'] = wordstem
        infl['stem']['lang'] = 'per'
        infl['pofs'] = {}
        if wordpofs:
            infl['pofs']['order'] = str(wordpofs[1])
            infl['pofs']['text'] = wordpofs[0]
        entry['infls'].append(infl)
        analysis['entries'].append(entry)
        analyses.append(analysis)
        return self.toalpheiosxml(analyses)

    def maptohazm(self, wordpofs):
        mapped = None
        if wordpofs == "N":
            mapped = ["noun", 1]
        elif wordpofs == "INT":
            mapped = ["interjection", 2]
        elif wordpofs == "DET":
            mapped = ["determiner", 3]
        elif wordpofs == "AJ":
            mapped = ["adjective", 4]
        elif wordpofs == "P":
            mapped = ["preposition", 5]
        elif wordpofs == "PRO":
            mapped = ["pronoun", 6]
        elif wordpofs == "CONJ":
            mapped = ["conjunction", 7]
        elif wordpofs == "V":
            mapped = ["verb", 8]
        elif wordpofs == "ADV":
            mapped = ["adverb", 9]
        elif wordpofs == "POSTP":
            mapped = ["postposition", 10]
        elif wordpofs == "Num":
            mapped = ["numeral", 11]
        elif wordpofs == "CL":
            mapped = ["classifier", 12]
        elif wordpofs == "e":
            mapped = ["ezafe", 13]
        return mapped

    def toalpheiosxml(self, analysis):
        '''
        represents an analysis in alpheios  xml format
        '''
        root = etree.Element('entries')
        for item in analysis:
            for entry in item['entries']:
                root.append(self.entrytoxml(entry))
        return root

    def entrytoxml(self, entry):
        '''
        represents an entry from an analysis in an xml fragment per the alpheios schema
        '''
        root = etree.Element('entry')
        dic = etree.SubElement(root, 'dict')
        hdwd = etree.SubElement(
            dic, 'hdwd', {
                '{http://www.w3.org/XML/1998/namespace}lang':
                entry['dict']['hdwd']['lang']
            })
        hdwd.text = entry['dict']['hdwd']['text']
        for i in entry['infls']:
            infl = etree.SubElement(root, 'infl')
            term = etree.SubElement(infl, 'term', {
                '{http://www.w3.org/XML/1998/namespace}lang':
                i['stem']['lang']
            })
            stem = etree.SubElement(term, 'stem')
            stem.text = i['stem']['text']
            if (i['pofs'] and i['pofs']['text']):
                pofs = etree.SubElement(infl, 'pofs',
                                        {'order': i['pofs']['order']})
                pofs.text = i['pofs']['text']
        return root
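
A hypothetical usage sketch of the lookup API documented above; the URI value and the Persian word are made up, and it assumes the base AlpheiosXmlEngine constructor needs nothing beyond code and config:

# Illustrative values only: PARSERS_HAZM_URI and the word/URI strings are not
# from the original code, and hazm/postagger.model must sit next to this module.
config = {'PARSERS_HAZM_URI': 'http://example.org/parsers/hazm'}
engine = HazmEngine('hazm', config)
result = engine.lookup(word='کتاب‌ها', word_uri='urn:example:1', language='per')
print(etree.tostring(result, encoding='unicode'))
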
Example #16
# -*- coding: UTF-8 -*-

from hazm import word_tokenize, POSTagger, Stemmer, Chunker, tree2brackets

POSTAGGER_MODEL = 'resources/postagger.model'

tagger = POSTagger(model=POSTAGGER_MODEL)
chunker = Chunker(model='resources/chunker.model')

BLACK_LIST = [
    'RT',
    'برای',
    'این',
]


def is_word_ok(word):
    return len(word) >= 3 and word not in BLACK_LIST


def get_hash_tags(text):
    return set([word for word in text.strip().split() if word.strip().startswith('#')])


def get_names(text):
    tagged_words = tagger.tag(word_tokenize(text))
    words = set(filter(
        lambda word: is_word_ok(word),
        [tagged_word[0] for tagged_word in filter(lambda tagged_word: tagged_word[1] == 'N', tagged_words)]
    ))
Example #17
from hazm import sent_tokenize, word_tokenize, Normalizer, HamshahriReader, POSTagger, DependencyParser
from InformationExtractor import InformationExtractor
from progress.bar import Bar


hamshahri = HamshahriReader()
normalizer = Normalizer()
tagger = POSTagger()
parser = DependencyParser(tagger=tagger)
extractor = InformationExtractor()
texts = []

output = open('informations.txt', 'w')
for text in Bar(max=310000).iter(hamshahri.texts()):
	texts.append(normalizer.normalize(text))
	if len(texts) <= 1000: continue

	sentences = []
	for text in texts:
		for sentence in sent_tokenize(text):
			words = word_tokenize(sentence)
			if len(words) >= 3:
				sentences.append(words)
	texts = []

	tagged = tagger.batch_tag(sentences)
	parsed = parser.tagged_batch_parse(tagged)

	for sentence in parsed:
		# print('*', *[node['word'] for node in sentence.nodelist if node['word']], file=output)
Example #18
class Preprocess:
    def __init__(self,
                 corpus_path='resources/corpus.json',
                 symbols_json_path='resources/symbols.json',
                 persian_lang_path='resources/persian_lang.json',
                 postagger_model_path='resources/postagger.model',
                 max_keyword_num=10, min_keyword_occurrences=0.01, expand_corpus=False):
        self.postagger_model_path = postagger_model_path
        self.symbols_json_path = symbols_json_path
        self.corpus_path = corpus_path
        self.corpus = {}
        self.docs_num = 0
        self.expand_corpus = expand_corpus

        if self.corpus_path is not None:
            with open(corpus_path, encoding='utf-8') as json_file:
                corpus = json.load(json_file)
            self.corpus = corpus['corpus']
            self.docs_num = corpus['docs_num']

        with open(symbols_json_path, encoding='utf-8') as json_file:
            data = json.load(json_file)
        lst = list(data.values())
        self.all_symbols_list = [item for sublist in lst for item in sublist]

        with open(persian_lang_path, encoding='utf-8') as json_file:
            persian_lang = json.load(json_file)

        self.epic_keywords = persian_lang['epic_keywords']
        self.punctuations = persian_lang['punctuations']
        self.persian_alphabet = persian_lang['persian_alphabet']
        self.stop_words = persian_lang['stop_words']

        self.tagger = POSTagger(model=self.postagger_model_path)
        self.normalizer = Normalizer()
        self.max_keyword_num = max_keyword_num
        self.min_keyword_occurrences = min_keyword_occurrences

    def sort_corpus(self):
        self.corpus = {k: v for k, v in sorted(self.corpus.items(),
                                               key=lambda item: item[1], reverse=True)}
        return self.corpus

    def save_corpus(self, save_path):
        with open(save_path, 'w', encoding='utf8') as f:
            corpus = {'docs_num': self.docs_num, 'corpus': self.corpus}
            json.dump(corpus, f, ensure_ascii=False, indent=4, separators=(',', ': '))

    def get_ngrams(self, words, n):
        n_grams = ngrams(words, n)
        return [' '.join(grams) for grams in n_grams]

    def get_symbols(self, words):
        syms = []
        hashtags = []
        for word in words:
            if '#' in word:
                word = word.replace('#', '')
                hashtags.append(word)
                if word in self.all_symbols_list:
                    syms.append(word)
            else:
                if word in self.all_symbols_list:
                    syms.append(word)

        return syms, hashtags

    def calculate_tfidf(self, word, count_in_content, content_len):

        tf = count_in_content / content_len
        idf = math.log(self.docs_num / self.corpus.get(word, 1)) + 1
        return tf * idf

    def get_keywords(self, candidate_words, content_len):
        if self.expand_corpus:
            self.docs_num += 1
        tfidf_list = []
        keywords = []

        for word in list(set(candidate_words)):
            if self.expand_corpus:
                self.corpus[word] = self.corpus.get(word, 0) + 1
            if word in self.epic_keywords:
                keywords.append(word)
            else:
                count_in_content = candidate_words.count(word)
                tfidf = self.calculate_tfidf(word, count_in_content, content_len)
                if self.corpus.get(word, 0) > self.min_keyword_occurrences * self.docs_num:
                    tfidf_list.append((word, tfidf))

        sorted_keywords = sorted(tfidf_list, key=lambda x: x[1], reverse=True)
        keywords += ([kywrd.replace('#', '')
                      for (kywrd, score) in sorted_keywords
                      if score > 0.1])
        if len(keywords) == 0:
            return [kywrd for (kywrd, score) in sorted_keywords[:1]]
        return keywords[:self.max_keyword_num]

    def extract_metadata(self, tweet):
        important_words = []
        syms = []
        hashtags = []
        content_len = 0

        content = self.normalizer.normalize(tweet['content'])
        if 'های وب' in content: syms.append('های_وب')
        sentences = sent_tokenize(content)
        for sentence in sentences:
            sentence = sentence.translate(str.maketrans('', '', self.punctuations))

            words = word_tokenize(sentence)
            content_len += len(words)
            sent_syms, sent_hashs = self.get_symbols(words)
            syms += sent_syms
            hashtags += sent_hashs
            tags = self.tagger.tag(words)
            verbs = [word for (word, role) in tags if role == 'V']

            filtered_words = ([word.replace('#', '')
                               for word in words if word.replace('#', '') not in self.stop_words
                               and word.replace('#', '') not in verbs
                               and set(word.replace('#', '')).intersection(self.persian_alphabet)
                               and len(word.replace('#', '')) > 1])
            important_words += filtered_words
        syms = list(set(syms))
        hashtags = list(set(hashtags))
        bigrams = self.get_ngrams(important_words, 2)
        trigrams = self.get_ngrams(important_words, 3)
        candidate_words = hashtags + syms + important_words + bigrams + trigrams
        keywords = self.get_keywords(candidate_words, content_len)
        return keywords, syms, hashtags

    def get_compelete_json(self, tweet):
        content_and_metadata = {}
        keywords, symbols, hashtags = self.extract_metadata(tweet)
        content_and_metadata['id'] = tweet['id']
        content_and_metadata['sendTime'] = tweet['sendTime']
        content_and_metadata['sendTimePersian'] = tweet['sendTimePersian']
        content_and_metadata['hashtags'] = hashtags
        content_and_metadata['keywords'] = keywords
        content_and_metadata['symbols'] = symbols
        content_and_metadata['image'] = tweet['imageUid'] if 'imageUid' in tweet.keys() else []
        content_and_metadata['senderUsername'] = tweet['senderUsername']
        content_and_metadata['senderName'] = tweet['senderName']
        content_and_metadata['content'] = tweet['content']
        return content_and_metadata
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

from gensim.summarization.syntactic_unit import SyntacticUnit
from gensim.parsing.preprocessing import preprocess_documents
from gensim.utils import tokenize
from six.moves import xrange
import re
import logging
from hazm import *

logger = logging.getLogger('summa.preprocessing.cleaner')

try:
    #from pattern.en import tag
    from hazm import POSTagger
    tagger = POSTagger(model='resources/postagger.model')
    logger.info(
        "'hazm' package found; tag filters are available for Persian")
    HAS_PATTERN = True
except ImportError:
    #logger.info("'pattern' package not found; tag filters are not available for English")
    logger.info(
        "'hazm' package not found; tag filters are not available for Persian"
    )
    HAS_PATTERN = False

SEPARATOR = r'@'
RE_SENTENCE = re.compile(
    r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)',
    re.UNICODE)  # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
Example #20
class POS():
    def __init__(self, inFile, outFile):
        self.inFile = inFile
        self.outFile = outFile
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.lemmatizer = Lemmatizer()
        self.stemmer = Stemmer()

    def posTaggerTXT(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:20s} {:20s} {:20s} {:20s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(t)
                    line = f.readline()

    def posTaggerHTML(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                o.write(self.preHTML())
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:s} -//- {:s} -//- {:s} -//- {:s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(self.divHTML(self.colorTag(li[1]), t))
                        o.write("\n")
                    line = f.readline()
                o.write(self.posHTML())

    def nameTag(self, tag):
        if tag == "V":
            return "فعل"
        elif tag == "N":
            return "اسم"
        elif tag == "ADV":
            return "قید"
        elif tag == "PRO":
            return "ضمیر"
        elif tag == "PUNC":
            return "نشانه نگارشی"
        elif tag == "Ne":
            return "غیر قابل تشخیص"
        elif tag == "NUM":
            return "عدد"
        elif tag == "CONJ":
            return "حرف ربط"
        elif tag == "POSTP":
            return "نشانه مفعولی"
        elif tag == "P":
            return "حرف اضافه"
        elif tag == "AJ":
            return "صفت"
        elif tag == "DET":
            return "ضمیر اشاره"
        else:
            return tag

    def colorTag(self, tag):
        if tag == "V":
            return "red"
        elif tag == "N":
            return "hotpink"
        elif tag == "ADV":
            return "blue"
        elif tag == "PRO":
            return "gold"
        elif tag == "PUNC":
            return "lightblue"
        elif tag == "Ne":
            return "darkgray"
        elif tag == "NUM":
            return "white"
        elif tag == "CONJ":
            return "lightgreen"
        elif tag == "POSTP":
            return "white"
        elif tag == "P":
            return "aqua"
        elif tag == "AJ":
            return "teal"
        elif tag == "DET":
            return "slateblue"
        else:
            return "white"

    def preHTML(self):
        return """<!DOCTYPE html>
<head>
    <meta charset="UTF-8">
</head>
<body>
"""

    def posHTML(self):
        return """
    </body>
</html>"""

    def divHTML(self, color, text):
        return """
        <div style="background-color:""" + color + """">
        """ + """<h4>""" + text + """</h4>
        """ + """</div>