def nouns(self, texts):
    total_count = len(texts)
    tagger = POSTagger()
    nouns = []
    tagged_doc = tagger.tag_sents(texts)
    for sent in tagged_doc:
        sentence = []
        for word, tag in sent:
            if tag == 'N':
                sentence.append(word)
        nouns.append(sentence)
    return nouns
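# A minimal standalone sketch of the same noun-filtering idea, not taken from the
# example above. The model path and the sample sentences are assumptions: it expects
# a trained hazm model at 'resources/postagger.model' and pre-tokenized sentences.
from hazm import POSTagger, word_tokenize

def extract_nouns(sentences, model_path='resources/postagger.model'):
    tagger = POSTagger(model=model_path)
    tagged = tagger.tag_sents(sentences)
    # keep only the tokens tagged as nouns ('N') in each sentence
    return [[word for word, tag in sent if tag == 'N'] for sent in tagged]

# example call with two tokenized sentences:
# nouns = extract_nouns([word_tokenize('...'), word_tokenize('...')])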
def hazmtoalpheios(word, uri):
    wordslist = etree.Element("words")
    normalizer = Normalizer()
    data = normalizer.normalize(word)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        words.extend(word_tokenize(sentence))
    analyses = []
    # build the analyzers once instead of re-creating them for every token
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    for item in words:
        wordstem = stemmer.stem(item)
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, _ = wordlema.split("#")
        # POSTagger.tag expects a list of tokens, not a bare string
        wordtagged = tagger.tag([item])
        wordpofs = wordtagged[0][1]
        wordpofs = maptohazm(wordpofs)
        # a better way to do this would be to create a Python class
        # to formalize the abstraction
        analysis = {}
        analysis['engine'] = 'hazm'
        analysis['uri'] = uri
        analysis['form'] = {}
        analysis['form']['text'] = item
        analysis['form']['lang'] = 'per'
        analysis['entries'] = []
        entry = {}
        entry['dict'] = {}
        entry['dict']['hdwd'] = {}
        entry['dict']['hdwd']['lang'] = 'per'
        entry['dict']['hdwd']['text'] = wordstem
        entry['infls'] = []
        infl = {}
        infl['stem'] = {}
        infl['stem']['text'] = wordstem
        infl['stem']['lang'] = 'per'
        infl['pofs'] = {}
        if wordpofs:
            infl['pofs']['order'] = str(wordpofs[1])
            infl['pofs']['text'] = wordpofs[0]
        entry['infls'].append(infl)
        analysis['entries'].append(entry)
        analyses.append(analysis)
    return analyses
def train_pos_tagger(bijankhan_file='resources/bijankhan.txt',
                     path_to_model='resources/persian.tagger',
                     path_to_jar='resources/stanford-postagger.jar',
                     properties_file='resources/persian.tagger.props',
                     memory_min='-Xms1g', memory_max='-Xmx2g', test_split=.1):
    bijankhan = BijankhanReader(bijankhan_file)
    train_file = 'resources/tagger_train_data.txt'
    output = codecs.open(train_file, 'w', 'utf8')
    sentences = list(bijankhan.sents())
    train_part = int(len(sentences) * (1 - test_split))
    for sentence in sentences[:train_part]:
        print(*(map(lambda w: '/'.join(w).replace(' ', '_'), sentence)), file=output)
    cmd = ['java', memory_min, memory_max, '-classpath', path_to_jar,
           'edu.stanford.nlp.tagger.maxent.MaxentTagger',
           '-prop', properties_file, '-model', path_to_model,
           '-trainFile', train_file, '-tagSeparator', '/', '-search', 'owlqn2']
    process = subprocess.Popen(cmd)
    process.wait()
    tagger = POSTagger()
    print('\n\n', 'Tagger Accuracy on Test Split:', tagger.evaluate(sentences[train_part:]))
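# Hedged usage sketch for the training helper above. It assumes the Bijankhan corpus,
# the Stanford POS tagger jar and the .props file already exist at the default paths
# under resources/ (none of these files are provided here).
if __name__ == '__main__':
    train_pos_tagger(test_split=0.1)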
def worker(identifier, skip, count):
    tagger = POSTagger()
    done = 0
    start = time.time()
    stopwords = load_stopwords()
    documents_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.HAMSHAHRI_DATABASE][Settings.HAMSHAHRI_COLLECTION]
    tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.TAGS_DATABASE][Settings.HAMSHAHRI_COLLECTION]
    batch_size = 50
    for batch in range(0, count, batch_size):
        hamshahri_cursor = documents_collection.find().skip(
            skip + batch).limit(batch_size)
        for doc in hamshahri_cursor:
            words = []
            sentences = sent_tokenize(doc['text'])
            sents = []
            for sentence in sentences:
                tokens = word_tokenize(sentence)
                text = [word for word in tokens if word not in stopwords]
                sents.append(text)
            tags = tagger.tag_sents(sents)
            for sent in tags:
                for word, tag in sent:
                    words.append({'word': word, "pos": tag})
            tags_collection.insert({
                "id": doc["id"],
                "categories_fa": doc["categories_fa"],
                "text": doc["text"],
                "words": words
            })
            done += 1
            #if done % 100 == 0:
            end = time.time()
            print('Worker' + str(identifier) + ': Done ' + str(done) +
                  ' out of ' + str(count) + ' in ' + ("%.2f" % (end - start)) +
                  ' sec ~ ' + ("%.2f" % (done / (end - start))) + '/sec')
            sys.stdout.flush()
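# Hedged usage sketch for the worker above: it splits an invented total of 1000
# documents across 4 processes. The counts are placeholders; the MongoDB settings
# and load_stopwords() must be available as the function already assumes.
if __name__ == '__main__':
    from multiprocessing import Process
    total, num_workers = 1000, 4
    chunk = total // num_workers
    jobs = [Process(target=worker, args=(i, i * chunk, chunk)) for i in range(num_workers)]
    for job in jobs:
        job.start()
    for job in jobs:
        job.join()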
def hazmtoalpheios(word, uri):
    wordslist = etree.Element("words")
    normalizer = Normalizer()
    item = normalizer.normalize(word)
    analyses = []
    stemmer = Stemmer()
    wordstem = stemmer.stem(item)
    lemmatizer = Lemmatizer()
    wordlema = lemmatizer.lemmatize(item)
    if '#' in wordlema:
        wordlema, _ = wordlema.split("#")
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    wordtagged = tagger.tag(word_tokenize(item))
    wordpofs = wordtagged[0][1]
    wordpofs = maptohazm(wordpofs)
    # a better way to do this would be to create a Python class
    # to formalize the abstraction
    analysis = {}
    analysis['engine'] = 'hazm'
    analysis['uri'] = uri
    analysis['form'] = {}
    analysis['form']['text'] = item
    analysis['form']['lang'] = 'per'
    analysis['entries'] = []
    entry = {}
    entry['dict'] = {}
    entry['dict']['hdwd'] = {}
    entry['dict']['hdwd']['lang'] = 'per'
    entry['dict']['hdwd']['text'] = wordstem
    entry['infls'] = []
    infl = {}
    infl['stem'] = {}
    infl['stem']['text'] = wordstem
    infl['stem']['lang'] = 'per'
    infl['pofs'] = {}
    if wordpofs:
        infl['pofs']['order'] = str(wordpofs[1])
        infl['pofs']['text'] = wordpofs[0]
    entry['infls'].append(infl)
    analysis['entries'].append(entry)
    analyses.append(analysis)
    return analyses
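# Hedged usage sketch: the word and URI below are invented inputs, and the call
# assumes model_path and maptohazm are defined at module level, as the function
# above already expects.
if __name__ == '__main__':
    for analysis in hazmtoalpheios('کتاب‌ها', 'urn:example:word/1'):
        print(analysis['form']['text'], analysis['entries'][0]['dict']['hdwd']['text'])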
def hazmtoalpheiosfile(data, uri):
    root = etree.Element("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF")
    oaannotation = etree.SubElement(
        root, '{http://www.w3.org/ns/oa#}Annotation',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':
         'http://services.projectbamboo.org/morphology' + uri})
    oahasbody = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasBody')
    oahastarget = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasTarget')
    hasbodydesc = etree.SubElement(
        oahastarget, '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': uri})
    ispartof = etree.SubElement(
        hasbodydesc, '{http://purl.org/dc/terms/}isPartOf',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': uri})
    source = etree.SubElement(
        hasbodydesc, '{http://purl.org/dc/terms/}source',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': uri})
    title = etree.SubElement(
        oaannotation, '{http://purl.org/dc/elements/1.1/}title',
        {'{http://www.w3.org/XML/1998/namespace}lang': 'eng'})
    title.text = "Morphology of " + uri
    # SubElement needs a parent element; the words list is attached to the body here
    wordslist = etree.SubElement(oahasbody, "words")
    normalizer = Normalizer()
    data = normalizer.normalize(data)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        words.extend(word_tokenize(sentence))
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    for item in words:
        wordstem = stemmer.stem(item)
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, _ = wordlema.split("#")
        # POSTagger.tag expects a list of tokens, not a bare string
        wordtagged = tagger.tag([item])
        wordpofs = wordtagged[0][1]
        word = etree.SubElement(wordslist, 'word')
        form = etree.SubElement(
            word, 'form', {'{http://www.w3.org/XML/1998/namespace}lang': 'per'})
        form.text = item
        entry = etree.SubElement(word, 'entry')
        infl = etree.SubElement(entry, 'infl')
        term = etree.SubElement(
            infl, 'term', {'{http://www.w3.org/XML/1998/namespace}lang': 'per'})
        stem = etree.SubElement(term, 'stem')
        stem.text = wordstem
        pofs = etree.SubElement(infl, 'pofs')
        pofs.text = wordpofs
    return root
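# Hedged usage sketch: serializes the RDF/XML tree returned by the function above
# for an invented sentence and URI; it assumes the same etree module and the
# module-level model_path used by that function.
if __name__ == '__main__':
    xml_root = hazmtoalpheiosfile('این یک جمله آزمایشی است.', 'urn:example:doc/1')
    print(etree.tostring(xml_root, encoding='unicode'))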
def __init__(self, question, useStemmer=False, useSynonyms=False, removeStopwords=False):
    self.question = question
    self.useStemmer = useStemmer
    self.useSynonyms = useSynonyms
    self.removeStopwords = removeStopwords
    self.stopWords = stopwords.words("english")
    self.stem = lambda k: k.lower()
    if self.useStemmer:
        ps = PorterStemmer()
        self.stem = ps.stem
    self.qType = self.determineQuestionType(question)
    self.searchQuery = self.buildSearchQuery(question)
    self.qVector = self.getQueryVector(self.searchQuery)
    self.aType = self.determineAnswerType(question)
    post = POSTagger()
def train_dependency_parser(train_file='resources/train.conll', test_file='resources/test.conll',
                            model_file='langModel.mco', path_to_jar='resources/malt.jar',
                            options_file='resources/options.xml',
                            features_file='resources/features.xml',
                            memory_min='-Xms7g', memory_max='-Xmx8g'):

    def read_conll(conll_file):
        trees = [DependencyGraph(item) for item in
                 dadegan_text(conll_file).replace(' ', '_').split('\n\n') if item.strip()]
        sentences = [[node['word'] for node in tree.nodelist[1:]] for tree in trees]
        return trees, sentences

    lemmatizer, tagger = Lemmatizer(), POSTagger()

    trees, sentences = read_conll(train_file)
    tagged = tagger.batch_tag(sentences)

    train_data = train_file + '.data'
    with codecs.open(train_data, 'w', 'utf8') as output:
        for tree, sentence in zip(trees, tagged):
            for i, (node, word) in enumerate(zip(tree.nodelist[1:], sentence), start=1):
                node['tag'] = word[1]
                node['lemma'] = lemmatizer.lemmatize(node['word'].replace('_', ' '), node['tag'])
                print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'),
                      node['tag'], node['tag'], '_', node['head'], node['rel'], '_', '_',
                      sep='\t', file=output)
            print(file=output)

    cmd = ['java', memory_min, memory_max, '-jar', path_to_jar, '-w', 'resources',
           '-c', model_file, '-i', train_data, '-f', options_file, '-F', features_file,
           '-m', 'learn']
    process = subprocess.Popen(cmd)
    process.wait()

    # evaluation
    print('\nEvaluating trained model on test data:')
    parser = DependencyParser(tagger=tagger, model_file=model_file)

    trees, sentences = read_conll(test_file)
    tagged = tagger.batch_tag(sentences)
    parsed = parser.tagged_batch_parse(tagged)

    test_data, test_results = test_file + '.data', test_file + '.results'
    print('\n'.join([sentence.to_conll(10) for sentence in trees]).strip(),
          file=codecs.open(test_data, 'w', 'utf8'))
    print('\n'.join([sentence.to_conll(10) for sentence in parsed]).strip(),
          file=codecs.open(test_results, 'w', 'utf8'))

    cmd = ['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results]
    process = subprocess.Popen(cmd)
    process.wait()
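# Hedged usage sketch for the training routine above. It assumes the CoNLL train/test
# files, malt.jar, MaltEval.jar, options.xml and features.xml named in the default
# arguments already exist, and that the dadegan_text helper it calls is importable.
if __name__ == '__main__':
    train_dependency_parser(model_file='langModel.mco')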
class HazmTokenizer(Component):
    defaults = {"stemmer": True, "lemmatizer": True, 'pos': False}

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        super().__init__(component_config)
        if self.component_config.stemmer:
            self._stemmer = Stemmer()
        if self.component_config.lemmatizer:
            self._lemmatizer = Lemmatizer()
        if self.component_config.pos:
            self._pos_tagger = POSTagger(model='resources/postagger.model')

    def required_packages(self) -> List[Text]:
        return ['hazm']

    def process(self, message: Message, **kwargs: Any) -> None:
        text = message.text
        for sentence_str in sent_tokenize(text):
            sentence = Sentence(sentence_str)
            tokens = word_tokenize(sentence_str)
            pos_tags = []
            if self.component_config.pos:
                pos_tags = self._pos_tagger.tag(tokens)
            for idx, token_str in enumerate(tokens):
                token = Token(text=token_str)
                if self.component_config.stemmer:
                    token[TOKEN_ATTRIBUTE_STEM] = self._stemmer.stem(token_str)
                if self.component_config.lemmatizer:
                    token[TOKEN_ATTRIBUTE_LEMM] = self._lemmatizer.lemmatize(token_str)
                if self.component_config.pos:
                    token[TOKEN_ATTRIBUTE_POS] = pos_tags[idx][1]
                sentence.add_token(token)
            message.add_sentence(sentence)
class HazmEngine(AlpheiosXmlEngine):

    def __init__(self, code, config, **kwargs):
        """
        Constructor
        :param code: code
        :type code: str
        :param config: app config
        :type config: dict
        """
        super(HazmEngine, self).__init__(code, config, **kwargs)
        self.code = code
        self.config = config
        self.oa_transformer = OaLegacyTransformer()
        self.language_codes = ['per', 'fas']
        self.uri = self.config['PARSERS_HAZM_URI']
        self.tagger = POSTagger(
            model=os.path.join(os.path.dirname(__file__), 'hazm', "postagger.model"))

    def lookup(self, word=None, word_uri=None, language=None, request_args=None, **kwargs):
        """
        Word Lookup Function
        :param word: the word to lookup
        :type word: str
        :param word_uri: a uri for the word
        :type word_uri: str
        :param language: the language code for the word
        :type language: str
        :param request_args: dict of engine specific request arguments
        :type request_args: dict
        :return: the analysis
        :rtype: str
        """
        normalizer = Normalizer()
        item = normalizer.normalize(word)
        analyses = []
        stemmer = Stemmer()
        wordstem = stemmer.stem(item)
        wordtagged = self.tagger.tag(word_tokenize(item))
        wordpofs = wordtagged[0][1]
        wordpofs = self.maptohazm(wordpofs)
        analysis = {}
        analysis['entries'] = []
        entry = {}
        entry['dict'] = {}
        entry['dict']['hdwd'] = {}
        entry['dict']['hdwd']['lang'] = 'per'
        entry['dict']['hdwd']['text'] = wordstem
        entry['infls'] = []
        infl = {}
        infl['stem'] = {}
        infl['stem']['text'] = wordstem
        infl['stem']['lang'] = 'per'
        infl['pofs'] = {}
        if wordpofs:
            infl['pofs']['order'] = str(wordpofs[1])
            infl['pofs']['text'] = wordpofs[0]
        entry['infls'].append(infl)
        analysis['entries'].append(entry)
        analyses.append(analysis)
        return self.toalpheiosxml(analyses)

    def maptohazm(self, wordpofs):
        mapped = None
        if wordpofs == "N":
            mapped = ["noun", 1]
        elif wordpofs == "INT":
            mapped = ["interjection", 2]
        elif wordpofs == "DET":
            mapped = ["determiner", 3]
        elif wordpofs == "AJ":
            mapped = ["adjective", 4]
        elif wordpofs == "P":
            mapped = ["preposition", 5]
        elif wordpofs == "PRO":
            mapped = ["pronoun", 6]
        elif wordpofs == "CONJ":
            mapped = ["conjunction", 7]
        elif wordpofs == "V":
            mapped = ["verb", 8]
        elif wordpofs == "ADV":
            mapped = ["adverb", 9]
        elif wordpofs == "POSTP":
            mapped = ["postposition", 10]
        elif wordpofs == "Num":
            mapped = ["numeral", 11]
        elif wordpofs == "CL":
            mapped = ["classifier", 12]
        elif wordpofs == "e":
            mapped = ["ezafe", 13]
        return mapped

    def toalpheiosxml(self, analysis):
        '''
        represents an analysis in alpheios xml format
        '''
        root = etree.Element('entries')
        for item in analysis:
            for entry in item['entries']:
                root.append(self.entrytoxml(entry))
        return root

    def entrytoxml(self, entry):
        '''
        represents an entry from an analysis in an xml fragment
        per the alpheios schema
        '''
        root = etree.Element('entry')
        dic = etree.SubElement(root, 'dict')
        hdwd = etree.SubElement(dic, 'hdwd', {
            '{http://www.w3.org/XML/1998/namespace}lang': entry['dict']['hdwd']['lang']
        })
        hdwd.text = entry['dict']['hdwd']['text']
        for i in entry['infls']:
            infl = etree.SubElement(root, 'infl')
            term = etree.SubElement(infl, 'term', {
                '{http://www.w3.org/XML/1998/namespace}lang': i['stem']['lang']
            })
            stem = etree.SubElement(term, 'stem')
            stem.text = i['stem']['text']
            if (i['pofs'] and i['pofs']['text']):
                pofs = etree.SubElement(infl, 'pofs', {'order': i['pofs']['order']})
                pofs.text = i['pofs']['text']
        return root
# -*- coding: UTF-8 -*-
from hazm import word_tokenize, POSTagger, Stemmer, Chunker, tree2brackets

POSTAGGER_MODEL = 'resources/postagger.model'

tagger = POSTagger(model=POSTAGGER_MODEL)
chunker = Chunker(model='resources/chunker.model')

BLACK_LIST = [
    'RT',
    'برای',
    'این',
]


def is_word_ok(word):
    return len(word) >= 3 and word not in BLACK_LIST


def get_hash_tags(text):
    return set([word for word in text.strip().split() if word.strip().startswith('#')])


def get_names(text):
    tagged_words = tagger.tag(word_tokenize(text))
    words = set(filter(
        lambda word: is_word_ok(word),
        [tagged_word[0] for tagged_word in
         filter(lambda tagged_word: tagged_word[1] == 'N', tagged_words)]
    ))
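# Hedged usage sketch with an invented tweet-like string; it only exercises the
# hashtag and word-filter helpers defined above.
if __name__ == '__main__':
    sample = 'RT #تهران این یک متن آزمایشی است'
    print(get_hash_tags(sample))   # {'#تهران'}
    print(is_word_ok('تهران'))     # True: long enough and not black-listed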
from hazm import sent_tokenize, word_tokenize, Normalizer, HamshahriReader, POSTagger, DependencyParser
from InformationExtractor import InformationExtractor
from progress.bar import Bar

hamshahri = HamshahriReader()
normalizer = Normalizer()
tagger = POSTagger()
parser = DependencyParser(tagger=tagger)
extractor = InformationExtractor()
texts = []

output = open('informations.txt', 'w')
for text in Bar(max=310000).iter(hamshahri.texts()):
    texts.append(normalizer.normalize(text))
    if len(texts) <= 1000:
        continue

    sentences = []
    for text in texts:
        for sentence in sent_tokenize(text):
            words = word_tokenize(sentence)
            if len(words) >= 3:
                sentences.append(words)
    texts = []

    tagged = tagger.batch_tag(sentences)
    parsed = parser.tagged_batch_parse(tagged)

    for sentence in parsed:
        # print('*', *[node['word'] for node in sentence.nodelist if node['word']], file=output)
class Preprocess:

    def __init__(self, corpus_path='resources/corpus.json',
                 symbols_json_path='resources/symbols.json',
                 persian_lang_path='resources/persian_lang.json',
                 postagger_model_path='resources/postagger.model',
                 max_keyword_num=10, min_keyword_occurrences=0.01,
                 expand_corpus=False):
        self.postagger_model_path = postagger_model_path
        self.symbols_json_path = symbols_json_path
        self.corpus_path = corpus_path
        self.corpus = {}
        self.docs_num = 0
        self.expand_corpus = expand_corpus
        if self.corpus_path is not None:
            with open(corpus_path, encoding='utf-8') as json_file:
                corpus = json.load(json_file)
                self.corpus = corpus['corpus']
                self.docs_num = corpus['docs_num']
        with open(symbols_json_path, encoding='utf-8') as json_file:
            data = json.load(json_file)
            lst = list(data.values())
            self.all_symbols_list = [item for sublist in lst for item in sublist]
        with open(persian_lang_path, encoding='utf-8') as json_file:
            persian_lang = json.load(json_file)
            self.epic_keywords = persian_lang['epic_keywords']
            self.punctuations = persian_lang['punctuations']
            self.persian_alphabet = persian_lang['persian_alphabet']
            self.stop_words = persian_lang['stop_words']
        self.tagger = POSTagger(model=self.postagger_model_path)
        self.normalizer = Normalizer()
        self.max_keyword_num = max_keyword_num
        self.min_keyword_occurrences = min_keyword_occurrences

    def sort_corpus(self):
        self.corpus = {k: v for k, v in sorted(self.corpus.items(),
                                               key=lambda item: item[1], reverse=True)}
        return self.corpus

    def save_corpus(self, save_path):
        with open(save_path, 'w', encoding='utf8') as f:
            corpus = {'docs_num': self.docs_num, 'corpus': self.corpus}
            json.dump(corpus, f, ensure_ascii=False, indent=4, separators=(',', ': '))

    def get_ngrams(self, words, n):
        n_grams = ngrams(words, n)
        return [' '.join(grams) for grams in n_grams]

    def get_symbols(self, words):
        syms = []
        hashtags = []
        for word in words:
            if '#' in word:
                word = word.replace('#', '')
                hashtags.append(word)
                if word in self.all_symbols_list:
                    syms.append(word)
            else:
                if word in self.all_symbols_list:
                    syms.append(word)
        return syms, hashtags

    def calculate_tfidf(self, word, count_in_content, content_len):
        tf = count_in_content / content_len
        idf = math.log(self.docs_num / self.corpus.get(word, 1)) + 1
        return tf * idf

    def get_keywords(self, candidate_words, content_len):
        if self.expand_corpus:
            self.docs_num += 1
        tfidf_list = []
        keywords = []
        for word in list(set(candidate_words)):
            if self.expand_corpus:
                self.corpus[word] = self.corpus.get(word, 0) + 1
            if word in self.epic_keywords:
                keywords.append(word)
            else:
                count_in_content = candidate_words.count(word)
                tfidf = self.calculate_tfidf(word, count_in_content, content_len)
                if self.corpus.get(word, 0) > self.min_keyword_occurrences * self.docs_num:
                    tfidf_list.append((word, tfidf))
        sorted_keywords = sorted(tfidf_list, key=lambda x: x[1], reverse=True)
        keywords += ([kywrd.replace('#', '') for (kywrd, score) in sorted_keywords
                      if score > 0.1])
        if len(keywords) == 0:
            return [kywrd for (kywrd, score) in sorted_keywords[:1]]
        return keywords[:self.max_keyword_num]

    def extract_metadata(self, tweet):
        important_words = []
        syms = []
        hashtags = []
        content_len = 0
        content = self.normalizer.normalize(tweet['content'])
        if 'های وب' in content:
            syms.append('های_وب')
        sentences = sent_tokenize(content)
        for sentence in sentences:
            sentence = sentence.translate(str.maketrans('', '', self.punctuations))
            words = word_tokenize(sentence)
            content_len += len(words)
            sent_syms, sent_hashs = self.get_symbols(words)
            syms += sent_syms
            hashtags += sent_hashs
            tags = self.tagger.tag(words)
            verbs = [word for (word, role) in tags if role == 'V']
            filtered_words = ([word.replace('#', '') for word in words
                               if word.replace('#', '') not in self.stop_words
                               and word.replace('#', '') not in verbs
                               and set(word.replace('#', '')).intersection(self.persian_alphabet)
                               and len(word.replace('#', '')) > 1])
            important_words += filtered_words
        syms = list(set(syms))
        hashtags = list(set(hashtags))
        bigrams = self.get_ngrams(important_words, 2)
        trigrams = self.get_ngrams(important_words, 3)
        candidate_words = hashtags + syms + important_words + bigrams + trigrams
        keywords = self.get_keywords(candidate_words, content_len)
        return keywords, syms, hashtags

    def get_compelete_json(self, tweet):
        content_and_metadata = {}
        keywords, symbols, hashtags = self.extract_metadata(tweet)
        content_and_metadata['id'] = tweet['id']
        content_and_metadata['sendTime'] = tweet['sendTime']
        content_and_metadata['sendTimePersian'] = tweet['sendTimePersian']
        content_and_metadata['hashtags'] = hashtags
        content_and_metadata['keywords'] = keywords
        content_and_metadata['symbols'] = symbols
        content_and_metadata['image'] = tweet['imageUid'] if 'imageUid' in tweet.keys() else []
        content_and_metadata['senderUsername'] = tweet['senderUsername']
        content_and_metadata['senderName'] = tweet['senderName']
        content_and_metadata['content'] = tweet['content']
        return content_and_metadata
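# Hedged usage sketch for the Preprocess class above: the tweet dict is an invented
# placeholder, and the constructor still expects the corpus/symbols/persian_lang JSON
# files and the tagger model at the default resources/ paths.
if __name__ == '__main__':
    preprocess = Preprocess(expand_corpus=False)
    tweet = {
        'id': 1,
        'sendTime': '2021-01-01T00:00:00',
        'sendTimePersian': '1399-10-12',
        'senderUsername': 'example_user',
        'senderName': 'Example',
        'content': 'این یک توییت آزمایشی درباره بورس و #سهام است',
    }
    print(preprocess.get_compelete_json(tweet))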
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

from gensim.summarization.syntactic_unit import SyntacticUnit
from gensim.parsing.preprocessing import preprocess_documents
from gensim.utils import tokenize
from six.moves import xrange

import re
import logging

from hazm import *

logger = logging.getLogger('summa.preprocessing.cleaner')

try:
    #from pattern.en import tag
    from hazm import POSTagger
    tagger = POSTagger(model='resources/postagger.model')
    logger.info("'hazm' package found; tag filters are available for Persian")
    HAS_PATTERN = True
except ImportError:
    #logger.info("'pattern' package not found; tag filters are not available for English")
    logger.info("'hazm' package not found; tag filters are not available for Persian")
    HAS_PATTERN = False

SEPARATOR = r'@'
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
# backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
class POS():

    def __init__(self, inFile, outFile):
        self.inFile = inFile
        self.outFile = outFile
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.lemmatizer = Lemmatizer()
        self.stemmer = Stemmer()

    def posTaggerTXT(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:20s} {:20s} {:20s} {:20s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(t)
                    line = f.readline()

    def posTaggerHTML(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                o.write(self.preHTML())
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:s} -//- {:s} -//- {:s} -//- {:s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(self.divHTML(self.colorTag(li[1]), t))
                        o.write("\n")
                    line = f.readline()
                o.write(self.posHTML())

    def nameTag(self, tag):
        # map hazm POS tags to their Persian names
        if tag == "V":
            return "فعل"
        elif tag == "N":
            return "اسم"
        elif tag == "ADV":
            return "قید"
        elif tag == "PRO":
            return "ضمیر"
        elif tag == "PUNC":
            return "نشانه نگارشی"
        elif tag == "Ne":
            return "غیر قابل تشخیص"
        elif tag == "NUM":
            return "عدد"
        elif tag == "CONJ":
            return "حرف ربط"
        elif tag == "POSTP":
            return "نشانه مفعولی"
        elif tag == "P":
            return "حرف اضافه"
        elif tag == "AJ":
            return "صفت"
        elif tag == "DET":
            return "ضمیر اشاره"
        else:
            return tag

    def colorTag(self, tag):
        # map hazm POS tags to highlight colors for the HTML output
        if tag == "V":
            return "red"
        elif tag == "N":
            return "hotpink"
        elif tag == "ADV":
            return "blue"
        elif tag == "PRO":
            return "gold"
        elif tag == "PUNC":
            return "lightblue"
        elif tag == "Ne":
            return "darkgray"
        elif tag == "NUM":
            return "white"
        elif tag == "CONJ":
            return "lightgreen"
        elif tag == "POSTP":
            return "white"
        elif tag == "P":
            return "aqua"
        elif tag == "AJ":
            return "teal"
        elif tag == "DET":
            return "slateblue"
        else:
            return "white"

    def preHTML(self):
        return """<!DOCTYPE html>
<head>
    <meta charset="UTF-8">
</head>
<body>
"""

    def posHTML(self):
        return """
</body>
</html>"""

    def divHTML(self, color, text):
        return """
        <div style="background-color:""" + color + """">
        """ + """<h4>""" + text + """</h4>
        """ + """</div>
        """
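# Hedged usage sketch for the POS class above: the input and output file names are
# invented placeholders, and the class still expects a trained hazm model at
# 'resources/postagger.model'.
if __name__ == '__main__':
    pos = POS('input.txt', 'output.html')
    pos.posTaggerHTML()   # or pos.posTaggerTXT() for the plain-text report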