Example 1
def tokenize(corpus, lemma=True, punctuation=True, space_to_space=False):

    if (not punctuation):
        # table = str.maketrans({key: None for key in string.punctuation})
        # corpus = corpus.translate(table)
        corpus = corpus.replace(',', ' ')
        corpus = corpus.replace("\u220c", "")
        corpus = corpus.replace('(', ' ')
        corpus = corpus.replace(')', ' ')
        corpus = corpus.replace('.', ' ')
        corpus = corpus.replace("،", " ")
        corpus = corpus.replace("«", " ")
        corpus = corpus.replace("»", " ")

    if (space_to_space):
        tokenized = corpus.split(' ')
    else:
        tokenized = word_tokenize(corpus)

    if (lemma):
        lemmatizer = Lemmatizer()
        for i in range(len(tokenized)):
            tokenized[i] = lemmatizer.lemmatize(tokenized[i]).split('#')[0]

    return tokenized
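A minimal usage sketch for this helper, assuming word_tokenize and Lemmatizer come from hazm (the imports are not shown in the excerpt):

from hazm import Lemmatizer, word_tokenize

# punctuation=False strips the listed punctuation marks before tokenizing;
# lemma=True keeps only the part of each lemma before '#' (the past stem for verbs).
tokens = tokenize("کتاب‌ها را خواندم.", lemma=True, punctuation=False)
print(tokens)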
Example 2
def stemming_and_lemmatization(token):
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()

    stemmed = stemmer.stem(token)
    lemmatized = lemmatizer.lemmatize(stemmed)
    return lemmatized
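Note that hazm encodes verb lemmas as a past/present stem pair joined by '#', which is why several later examples split on that character; a small hedged illustration:

from hazm import Lemmatizer

lemmatizer = Lemmatizer()
print(lemmatizer.lemmatize('کتاب‌ها'))  # 'کتاب'
print(lemmatizer.lemmatize('می‌روم'))   # 'رفت#رو' (past stem '#' present stem)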
Example 3
def evaluate_lemmatizer(conll_file='resources/train.conll', bijankhan_file='resources/bijankhan.txt'):
	lemmatizer = Lemmatizer()

	errors = []
	output = codecs.open('resources/lemmatizer_errors.txt', 'w', 'utf8')
	for line in dadegan_text(conll_file).split('\n'):
		parts = line.split('\t')
		if len(parts) < 10:
			continue
		word, lemma, pos = parts[1], parts[2], parts[3]
		if lemmatizer.lemmatize(word, pos) != lemma:
			errors.append((word, lemma, pos, lemmatizer.lemmatize(word, pos)))
	print(len(errors), 'errors', file=output)
	counter = Counter(errors)
	for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True):
		print(count, *item, file=output)

	missed = []
	output = codecs.open('resources/lemmatizer_missed.txt', 'w', 'utf8')
	bijankhan = BijankhanReader(bijankhan_file)
	for sentence in bijankhan.sents():
		for word in sentence:
			if word[1] == 'V':
				if word[0] == lemmatizer.lemmatize(word[0]):
					missed.append(word[0])
	print(len(missed), 'missed', file=output)
	counter = Counter(missed)
	for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True):
		print(count, item, file=output)
Example 4
 def __init__(self, inFile, outFile):
     self.inFile = inFile
     self.outFile = outFile
     self.normalizer = Normalizer()
     self.tagger = POSTagger(model='resources/postagger.model')
     self.lemmatizer = Lemmatizer()
     self.stemmer = Stemmer()
Example 5
 def get_lemmatizer(self, document):
     ''' Lemmatizer '''
     content = self.clear_document(document)
     result = self.split_document(content)    
     lemmatizer = Lemmatizer()
     lemma_set = [(item, lemmatizer.lemmatize(item)) for item in result]
     return lemma_set
Example 6
def lemmatize(target_string):
    lemmatized_string = ""
    lemmatizer = Lemmatizer()

    for single_word in target_string.split():
        lemmatized_string += lemmatizer.lemmatize(single_word) + " "

    return lemmatized_string
Example 7
def statement_pre_processing(input_statement):
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    input_statement = normalizer.normalize(input_statement)
    input_statement = [
        lemmatizer.lemmatize(word) for word in word_tokenize(input_statement)
        if word not in stops
    ]
    return input_statement
Example 8
    def __init__(self, component_config: Dict[Text, Any] = None) -> None:

        super().__init__(component_config)
        if self.component_config.stemmer:
            self._stemmer = Stemmer()

        if self.component_config.lemmatizer:
            self._lemmatizer = Lemmatizer()

        if self.component_config.pos:
            self._pos_tagger = POSTagger(model='resources/postagger.model')
Example 9
def preprocess(doc):
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    normalizer = Normalizer()
    doc = normalizer.normalize(doc)
    tokenized = re.split(' |-', doc)
    for w in tokenized[:]:
        if w in stopwords:
            tokenized.remove(w)
    stemmed = [stemmer.stem(w) for w in tokenized]
    new_words = [word for word in stemmed if word.isalnum()]
    lemmatized = [lemmatizer.lemmatize(w) for w in new_words]
    return lemmatized
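preprocess expects a stopwords collection and the re module to already be in scope; a hedged usage sketch that borrows hazm's stopwords_list (the same helper Example 31 uses):

import re
from hazm import Normalizer, Stemmer, Lemmatizer, stopwords_list

stopwords = set(stopwords_list())
print(preprocess("کتاب‌های خوب را می‌خوانیم"))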
Example 10
def dataset_cleaner(dataset):
    statements = []
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    for i in range(len(dataset)):
        normalized_statement = normalizer.normalize(dataset[i])
        # for sentence in sent_tokenize(dataset[i]):
        word_list = [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(normalized_statement)
            if word not in stops
        ]
        statements.append(word_list)
    return statements
Example 11
def find_tokens_in_sentence(sentence_ner, sentence_ner_lem):
    tokens_lem = []
    for token in sentence_ner_lem:
        if len(tokens_lem) > 0:
            if token['word'].startswith('##'):
                tokens_lem[-1]['word'] += ' ' + token['word'][2:]
                tokens_lem[-1]['index'] += 1
            elif token['entity'].split(
                    '-')[1] == tokens_lem[-1]['entity_group'] and token[
                        'index'] == tokens_lem[-1]['index'] + 1:
                tokens_lem[-1]['word'] += ' ' + token['word']
                tokens_lem[-1]['index'] += 1
            else:
                tokens_lem += [{
                    'word': Lemmatizer().lemmatize(token['word']),
                    'entity_group': token['entity'].split('-')[1],
                    'index': token['index']
                }]
        else:
            tokens_lem += [{
                'word': Lemmatizer().lemmatize(token['word']),
                'entity_group': token['entity'].split('-')[1],
                'index': token['index']
            }]

    tokens = []
    for token in sentence_ner:

        if len(tokens) > 0:
            if token['word'].startswith('##'):
                tokens[-1]['word'] += ' ' + token['word'][2:]
                tokens[-1]['index'] += 1
            elif token['entity'].split(
                    '-')[1] == tokens[-1]['entity_group'] and token[
                        'index'] == tokens[-1]['index'] + 1:
                tokens[-1]['word'] += ' ' + token['word']
                tokens[-1]['index'] += 1
            else:
                tokens += [{
                    'word': token['word'],
                    'entity_group': token['entity'].split('-')[1],
                    'index': token['index']
                }]
        else:
            tokens += [{
                'word': token['word'],
                'entity_group': token['entity'].split('-')[1],
                'index': token['index']
            }]
    return tokens, tokens_lem
Example 12
def stemmer(email):
    """
    :param email: a string of email text
    :return: a string in which each verb has been replaced by its root
    """
    lemmatizer = Lemmatizer()
    stem_finder = Stemmer()  # note: convert_to_stem and the '&' split below follow parsivar's FindStems API, not hazm's Stemmer
    tokens = ''
    for word in email.split():
        token = lemmatizer.lemmatize(word)
        if '#' in token:
            token = token.split('#')
            if word in token[0]:
                token = token[0]
            else:
                token = token[1]
        else:
            token = stem_finder.convert_to_stem(word)
            if '&' in token:
                token = token.split('&')
                if word in token[0]:
                    token = token[0]
                else:
                    token = token[1]

        tokens += token + ' '

    return tokens
Example 13
class LemmaFilter(Filter):
	def __init__(self):
		self.lemmatizer = Lemmatizer()

	def __call__(self, tokens):
		for token in tokens:
			token.text = self.lemmatizer.lemmatize(token.text)
			yield token
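LemmaFilter implements the token-filter protocol used by Whoosh-style analyzers (a Filter whose __call__ rewrites token.text and re-yields the token). Purely as an assumption that Filter does come from Whoosh, it could be chained after a tokenizer like this:

from whoosh.analysis import RegexTokenizer

analyzer = RegexTokenizer() | LemmaFilter()
print([token.text for token in analyzer("کتاب‌ها را خواندم")])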
Example 14
 def TextCleaner(self):
     self.stopwordsList = ''
     Data = self.imported_data
     stemmer = Stemmer()
     lemmatizer = Lemmatizer()
     dataList = Data
     table = str.maketrans('', '', punctuation)
     
     for i in range(0, len(dataList)):
         for j in range(0, len(dataList[i][0])):
             dataList[i][0][j] = stemmer.stem(dataList[i][0][j])                
             dataList[i][0][j] = lemmatizer.lemmatize(dataList[i][0][j])
         dataList[i][0] = [word for word in dataList[i][0] if word.isalpha()]
         dataList[i][0] = [w.translate(table) for w in dataList[i][0]]
         dataList[i][0] = [word for word in dataList[i][0] if len(word) > 3]
     self.imported_data = dataList
     return self.imported_data
Example 15
 def lemmatize(self):
     """
     :return:
     """
     lemmatizer = Lemmatizer()
     for words in self.words:
         temp = []
         for word in words:
             word_lemma = lemmatizer.lemmatize(word)
             if word_lemma is not None:
                 if "#" in word_lemma:
                     temp.append(word_lemma.split("#")[1])
                 else:
                     temp.append(word_lemma)
             else:
                 temp.append(word)
         self.lemmatized_words.append(temp)
     return self.lemmatized_words
Example 16
def perform_word_lemmatization(data_dict):
    from hazm import Lemmatizer
    lemmatizer = Lemmatizer()

    return_value = {}
    for folder_name in data_dict.keys():
        return_value[folder_name] = {}
        for file_name in data_dict[folder_name].keys():
            this_files_words = []
            for sent_text in data_dict[folder_name][file_name]:
                this_sentences_words = []
                for word in sent_text:
                    lemma_word = lemmatizer.lemmatize(word)
                    this_sentences_words.append(lemma_word)
                this_files_words.append(this_sentences_words)
            return_value[folder_name][file_name] = this_files_words

    return return_value
Example 17
def process_text(text):
    normalizer = Normalizer()
    text = normalizer.normalize(text)
    text = text.replace("_", " ")
    text = text.replace(',', ' ')
    text = text.replace("\u220c", "")
    text = text.replace("\u200c", "")
    text = text.replace("-", "")
    # text = text.replace('/', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = text.replace('.', ' ')
    text = text.replace("،", " ")
    text = text.replace("«", " ")
    text = text.replace("»", " ")
    # Convert the text string to a list of words:
    # keep runs of Arabic-script characters plus the <S>, </s>, ? and // markers, dropping everything else
    t = re.findall("[\u0627-\u06FF]+|<S>|</s>|\?|//", text)
    lemmatizer = Lemmatizer()
    text = [lemmatizer.lemmatize(x) for x in t]
    return text
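A short usage sketch for process_text, assuming the imports the snippet relies on (re plus hazm's Normalizer and Lemmatizer):

import re
from hazm import Normalizer, Lemmatizer

print(process_text("کتاب‌ها را خواندم."))  # a list of lemmas; the exact output depends on the hazm version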
Example 18
def tokenizer(input_var):
    tokenized = []
    normalizer1 = Normalizer(True, False, False)
    normalizer2 = Normalizer(False, True, False)
    normalizer3 = Normalizer(False, False, True)
    word_tokenizer = WordTokenizer(False)
    input_var = normalizer1.normalize(
        normalizer2.normalize(normalizer3.normalize(input_var)))
    actual = word_tokenizer.tokenize(input_var)
    lemmatizer = Lemmatizer()

    # stemmer = Stemmer

    for x in actual:
        # print(x);
        s = lemmatizer.lemmatize(x)
        if "#" in s and s.split("#")[0] != "":
            tokenized.append(s.split("#")[0] + "ن")
        else:
            tokenized.append(s.replace("#", ""))
    return tokenized
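This tokenizer rebuilds an infinitive-like form from verb lemmas by keeping the past stem and appending 'ن'. A hedged illustration of that branch, assuming hazm's Normalizer, WordTokenizer and Lemmatizer are imported:

from hazm import Normalizer, WordTokenizer, Lemmatizer

print(tokenizer("می‌روم"))  # the lemma 'رفت#رو' is expected to become 'رفتن'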
Example 19
def train_dependency_parser(train_file='resources/train.conll', test_file='resources/test.conll', model_file='langModel.mco', path_to_jar='resources/malt.jar', options_file='resources/options.xml', features_file='resources/features.xml', memory_min='-Xms7g', memory_max='-Xmx8g'):

	def read_conll(conll_file):
		trees = [DependencyGraph(item) for item in dadegan_text(conll_file).replace(' ', '_').split('\n\n') if item.strip()]
		sentences = [[node['word'] for node in tree.nodelist[1:]] for tree in trees]
		return trees, sentences

	lemmatizer, tagger = Lemmatizer(), POSTagger()

	trees, sentences = read_conll(train_file)
	tagged = tagger.batch_tag(sentences)

	train_data = train_file +'.data'
	with codecs.open(train_data, 'w', 'utf8') as output:
		for tree, sentence in zip(trees, tagged):
			for i, (node, word) in enumerate(zip(tree.nodelist[1:], sentence), start=1):
				node['tag'] = word[1]
				node['lemma'] = lemmatizer.lemmatize(node['word'].replace('_', ' '), node['tag'])
				print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'), node['tag'], node['tag'], '_', node['head'], node['rel'], '_', '_', sep='\t', file=output)
			print(file=output)

	cmd = ['java', memory_min, memory_max, '-jar', path_to_jar, '-w', 'resources', '-c', model_file, '-i', train_data, '-f', options_file, '-F', features_file, '-m', 'learn']
	process = subprocess.Popen(cmd)
	process.wait()

	# evaluation
	print('\nEvaluating trained model on test data:')
	parser = DependencyParser(tagger=tagger, model_file=model_file)

	trees, sentences = read_conll(test_file)
	tagged = tagger.batch_tag(sentences)
	parsed = parser.tagged_batch_parse(tagged)

	test_data, test_results = test_file +'.data', test_file +'.results'
	print('\n'.join([sentence.to_conll(10) for sentence in trees]).strip(), file=codecs.open(test_data, 'w', 'utf8'))
	print('\n'.join([sentence.to_conll(10) for sentence in parsed]).strip(), file=codecs.open(test_results, 'w', 'utf8'))

	cmd = ['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results]
	process = subprocess.Popen(cmd)
	process.wait()
Example 20
def pipeline_sentence(sentence, model, tokenizer):
    sentence = change_words(sentence)

    normalizer = Normalizer()
    sentence = normalizer.normalize(sentence)
    sentence_lem = ' '.join([
        Lemmatizer().lemmatize(x)
        for x in word_tokenize(normalizer.normalize(sentence))
    ])
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    sentence_ner = nlp(sentence)
    sentence_ner_lem = nlp(sentence_lem)
    return sentence_ner, sentence_ner_lem, sentence_lem, sentence
Example 21
class HazmTokenizer(Component):
    defaults = {"stemmer": True, "lemmatizer": True, 'pos': False}

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:

        super().__init__(component_config)
        if self.component_config.stemmer:
            self._stemmer = Stemmer()

        if self.component_config.lemmatizer:
            self._lemmatizer = Lemmatizer()

        if self.component_config.pos:
            self._pos_tagger = POSTagger(model='resources/postagger.model')

    def required_packages(self) -> List[Text]:
        return ['hazm']

    def process(self, message: Message, **kwargs: Any) -> None:
        text = message.text
        for sentence_str in sent_tokenize(text):
            sentence = Sentence(sentence_str)
            tokens = word_tokenize(sentence_str)
            pos_tags = []
            if self.component_config.pos:
                pos_tags = self._pos_tagger.tag(tokens)
            for idx, token_str in enumerate(tokens):
                token = Token(text=token_str)
                if self.component_config.stemmer:
                    token[TOKEN_ATTRIBUTE_STEM] = self._stemmer.stem(token_str)
                if self.component_config.lemmatizer:
                    token[TOKEN_ATTRIBUTE_LEMM] = self._lemmatizer.lemmatize(
                        token_str)
                if self.component_config.pos:
                    token[TOKEN_ATTRIBUTE_POS] = pos_tags[idx][1]
                sentence.add_token(token)
            message.add_sentence(sentence)
Example 22
class POS():
    def __init__(self, inFile, outFile):
        self.inFile = inFile
        self.outFile = outFile
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.lemmatizer = Lemmatizer()
        self.stemmer = Stemmer()

    def posTaggerTXT(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:20s} {:20s} {:20s} {:20s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(t)
                    line = f.readline()

    def posTaggerHTML(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                o.write(self.preHTML())
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:s} -//- {:s} -//- {:s} -//- {:s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(self.divHTML(self.colorTag(li[1]), t))
                        o.write("\n")
                    line = f.readline()
                o.write(self.posHTML())

    def nameTag(self, tag):
        if tag == "V":
            return "فعل"
        elif tag == "N":
            return "اسم"
        elif tag == "ADV":
            return "قید"
        elif tag == "PRO":
            return "ضمیر"
        elif tag == "PUNC":
            return "نشانه نگارشی"
        elif tag == "Ne":
            return "غیر قابل تشخیص"
        elif tag == "NUM":
            return "عدد"
        elif tag == "CONJ":
            return "حرف ربط"
        elif tag == "POSTP":
            return "نشانه مفعولی"
        elif tag == "P":
            return "حرف اضافه"
        elif tag == "AJ":
            return "صفت"
        elif tag == "DET":
            return "ضمیر اشاره"
        else:
            return tag

    def colorTag(self, tag):
        if tag == "V":
            return "red"
        elif tag == "N":
            return "hotpink"
        elif tag == "ADV":
            return "blue"
        elif tag == "PRO":
            return "gold"
        elif tag == "PUNC":
            return "lightblue"
        elif tag == "Ne":
            return "darkgray"
        elif tag == "NUM":
            return "white"
        elif tag == "CONJ":
            return "lightgreen"
        elif tag == "POSTP":
            return "white"
        elif tag == "P":
            return "aqua"
        elif tag == "AJ":
            return "teal"
        elif tag == "DET":
            return "slateblue"
        else:
            return "white"

    def preHTML(self):
        return """<!DOCTYPE html>
<head>
    <meta charset="UTF-8">
</head>
<body>
"""

    def posHTML(self):
        return """
    </body>
</html>"""

    def divHTML(self, color, text):
        return """
        <div style="background-color:""" + color + """">
        """ + """<h4>""" + text + """</h4>
        """ + """</div>
Example 23
def Evaluate_lemmatizer(inputs, labels, lib='hazm'):
    predicted_labels_with_pos = []
    predicted_labels_no_pos = []

    if lib == 'hazm':
        lemmatizer = Lemmatizer()
        for sentence in inputs:
            sent_labels_with_pos = []
            sent_labels_no_pos = []

            for (word, pos) in sentence:
                if pos == 'ADJ':
                    pos = 'AJ'
                sent_labels_with_pos.append(lemmatizer.lemmatize(word, pos))
                sent_labels_no_pos.append(lemmatizer.lemmatize(word))

            predicted_labels_with_pos.append(sent_labels_with_pos)
            predicted_labels_no_pos.append(sent_labels_no_pos)

    elif lib == 'parsivar':
        stemmer = FindStems()
        for sentence in inputs:
            sent_labels_with_pos = []
            sent_labels_no_pos = []

            for (word, pos) in sentence:
                sent_labels_with_pos.append(stemmer.convert_to_stem(word, pos))
                sent_labels_no_pos.append(stemmer.convert_to_stem(word))

            for i in range(len(sentence)):
                if sentence[i][1] == 'V':
                    sent_labels_with_pos[i] = re.sub(r"&", r"#",
                                                     sent_labels_with_pos[i])
                    sent_labels_no_pos[i] = re.sub(r"&", r"#",
                                                   sent_labels_no_pos[i])

            predicted_labels_with_pos.append(sent_labels_with_pos)
            predicted_labels_no_pos.append(sent_labels_no_pos)

    precisions_with_pos = []
    precisions_no_pos = []
    all_truly_labeled_with_pos = []

    for i in range(len(labels)):
        truly_labeled_with_pos = [
            predicted_labels_with_pos[i][j] == labels[i][j]
            for j in range(len(labels[i]))
        ]
        all_truly_labeled_with_pos.append(truly_labeled_with_pos)
        num_truly_labeled_with_pos = sum(truly_labeled_with_pos)
        truly_labeled_no_pos = [
            predicted_labels_no_pos[i][j] == labels[i][j]
            for j in range(len(labels[i]))
        ]
        num_truly_labeled_no_pos = sum(truly_labeled_no_pos)

        precision_with_pos = num_truly_labeled_with_pos / len(labels[i])
        precision_no_pos = num_truly_labeled_no_pos / len(labels[i])
        precisions_with_pos.append(precision_with_pos)
        precisions_no_pos.append(precision_no_pos)

    per_pos = {}
    detailed_analyze = {}
    for i in range(len(inputs)):
        for j in range(len(inputs[i])):

            if inputs[i][j][1] not in per_pos.keys():
                per_pos[inputs[i][j][1]] = {'true': 0, 'false': 0}

            if all_truly_labeled_with_pos[i][j]:
                per_pos[inputs[i][j][1]]['true'] += 1
            else:
                per_pos[inputs[i][j][1]]['false'] += 1

            if inputs[i][j][1] not in detailed_analyze.keys():
                detailed_analyze[inputs[i][j][1]] = {'true': [], 'false': []}

            # detailed_analyze[inputs[i][j][1]]['gold'].append(labels[i][j])
            if all_truly_labeled_with_pos[i][j]:
                detailed_analyze[inputs[i][j][1]]['true'].append(
                    inputs[i][j][0])
                # detailed_analyze[inputs[i][j][1]]['false'].append('NONE')
            else:
                detailed_analyze[inputs[i][j][1]]['false'].append(
                    inputs[i][j][0])
                # detailed_analyze[inputs[i][j][1]]['true'].append('NONE')

    accuracy_per_pos = {
        k: v['true'] / (v['true'] + v['false'])
        for k, v in per_pos.items()
    }
    for k, v in detailed_analyze.items():
        v['true'] = set(v['true'])
        v['false'] = set(v['false'])
    precision_with_pos = sum(precisions_with_pos) / len(precisions_with_pos)
    precision_no_pos = sum(precisions_no_pos) / len(precisions_no_pos)
    return precision_with_pos, precision_no_pos, accuracy_per_pos, detailed_analyze
Example 24
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from hazm import Lemmatizer, Normalizer, word_tokenize

if __name__ == "__main__":
    data = [
        "من به یادگیری ماشین بسیار علاقه‌مند هستم",
        "من عاشق کد‌نویسی با پایتون هستم",
        "من عاشق ساختن نرم‌افزارهای هوشمند هستم",
        "هوشمند‌سازی یک نرم‌افزار فرآیندی بسیار پیچیده است"
    ]

    normalizer = Normalizer()
    lemmatizer = Lemmatizer()

    data = [normalizer.normalize(_d) for i, _d in enumerate(data)]
    lemmatizer = Lemmatizer()
    tagged_data = [
        TaggedDocument(words=[
            lemmatizer.lemmatize(_d) for i, _d in enumerate(
                word_tokenize(normalizer.normalize(_d.lower())))
        ],
                       tags=[str(i)]) for i, _d in enumerate(data)
    ]

    vec_size = 100
    alpha = 0.025

    model = Doc2Vec(vec_size=vec_size,
Example 25
from hazm import Normalizer
from hazm import Lemmatizer
from hazm import word_tokenize


PUNCS = ['،', '.', ',', ':', ';', '"']
NORMALIZER = Normalizer()
LEMMATIZER = Lemmatizer()

def text_cleaner(text):
    normalized = NORMALIZER.normalize(text)
    tokenized = word_tokenizer(normalized)
    tokens = []
    for t in tokenized:
        temp = t
        for p in PUNCS:
            temp = temp.replace(p, '')
        tokens.append(temp)
    tokens = [w for w in tokens if not len(w) <= 1]
    tokens = [w for w in tokens if not w.isdigit()]
    tokens = [LEMMATIZER.lemmatize(w) for w in tokens]

    return ' '.join(tokens)


def word_tokenizer(text):
    return word_tokenize(text)
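A quick usage sketch for text_cleaner (the module-level NORMALIZER and LEMMATIZER above are reused):

print(text_cleaner("کتاب‌ها را، خواندم."))  # punctuation stripped, short and numeric tokens dropped, the rest lemmatized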
Example 26
	def __init__(self):
		self.lemmatizer = Lemmatizer()
Example 27
# from parsivar import FindStems
# my_stemmer = FindStems()
# print(my_stemmer.convert_to_stem('درافتادن'))

# from hazm import Stemmer
# stemmer = Stemmer()
# print(stemmer.stem('کتاب‌ها'))

from hazm import Lemmatizer
lemmatizer = Lemmatizer()
print(lemmatizer.lemmatize("آبادگری"))
Example 28
def lemmatizer(tweets):
    lemmatizer_tweets = []
    for tweet in tweets:
        lemmatizer_tweets.append(Lemmatizer().lemmatize(tweet))

    return lemmatizer_tweets
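This version constructs a fresh Lemmatizer for every tweet, which reloads hazm's word and verb lists each time; a hedged variant (the lemmatize_tweets name is only illustrative) that builds the instance once:

def lemmatize_tweets(tweets):
    lemmatizer = Lemmatizer()  # built once and reused; Lemmatizer() loads its resources on construction
    return [lemmatizer.lemmatize(tweet) for tweet in tweets]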
Example 29
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.simplefilter(action='ignore', category=FutureWarning)

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from hazm import Lemmatizer, Normalizer, word_tokenize
from pyravendb.store import document_store
from DocumentObjects import Document
import pandas as pd
import json

if __name__ == "__main__":
    # data = pd.read_excel("dataset.xlsx")
    # data = list(data["description"].astype(str).values.flatten())

    normalizer = Normalizer()
    lemmatizer = Lemmatizer()

    model = Doc2Vec.load("BasicModel")
    test_str = "بازکردن فلنج ها جهت نصب صفحات مسدود کننده"
    test_str = normalizer.normalize(test_str)

    test_data = [lemmatizer.lemmatize(_d) for i,  _d in enumerate(word_tokenize(test_str))]
    result = model.docvecs.most_similar([model.infer_vector(test_data)], topn=10)

    store = document_store.DocumentStore(urls=["http://localhost:8080"], database="SeSimi")
    store.initialize()
    with store.open_session() as session:
        print()
        [print(list(session.query(collection_name='Documents').where(key=result[i][0]))[0].title) for i in range(0, len(result))]
Example 30
stopwords_f = open('stop_words.txt', 'r', encoding='utf-8')
stopwords = stopwords_f.readlines()
for i in range(len(stopwords)):
    stopwords[i] = stopwords[i].replace("\n", "")
samewords_f = open('same_words.txt', 'r', encoding='utf-8')
samewords = samewords_f.readlines()
#samewords_tokens = word_tokenize(samewords_f.read(),"\n")
for i in range(len(samewords)):
    samewords[i] = samewords[i].replace("\n", "")
    samewords[i] = word_tokenize(samewords[i])
#print('same=' + str(samewords))
samewords_f.close()
stopwords_f.close()
#print('stop='+str(stopwords))

lemmatizer = Lemmatizer()
normalizer = Normalizer()
#print(query_process("ما تو را کودک،. کتابهای به برای دوست داریم خودرو را هنوز اتومبیل"))


@app.route('/api/dataframe', methods=['GET'])
def df():
    return j

def find_in_dictionary(word,dictionary1):
    if word in dictionary1:
        return dictionary1[word].copy()  # returns a list of docIDs where the term appears
    else:
        return []

Example 31
class Preprocessor:
    normalizer = Normalizer()
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tokenizer = WordTokenizer()
    stop_words = stopwords_list()

    @staticmethod
    def remove_noise(text: str) -> str:
        return Preprocessor.__remove_punctuation(
            Preprocessor.__remove_emojis(text))

    @staticmethod
    def remove_stop_words(tokens: List[str]) -> List[str]:
        return [t for t in tokens if t not in Preprocessor.stop_words]

    @staticmethod
    def __remove_emojis(text: str):

        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u'\U00010000-\U0010ffff'
            u"\u200d"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\u3030"
            u"\ufe0f"
            "]+",
            flags=re.UNICODE)

        first_cleaned_text = emoji_pattern.sub(r'', text)  # no emoji
        return emoji.get_emoji_regexp().sub(r'', first_cleaned_text)

    @staticmethod
    def __remove_punctuation(text: str):
        try:
            return re.sub(
                r'[\.\?\!\,\:\;\،\(\)\؛\#\%\^\&\$\~\'\"\×\-\_\*\>\<\+\=\\\/]',
                '', text)
        except TypeError as e:
            print(e, text)

    @staticmethod
    def normalize(text: str) -> str:
        return Preprocessor.normalizer.normalize(text)

    @staticmethod
    def stem(word: str) -> str:
        return Preprocessor.stemmer.stem(word)

    @staticmethod
    def lemmatize(word: str) -> str:
        return Preprocessor.lemmatizer.lemmatize(word)

    @staticmethod
    def tokenize(text: str) -> List[str]:
        return Preprocessor.tokenizer.tokenize(text)

    @staticmethod
    def preprocess(text: str) -> str:
        cleaned_text = Preprocessor.remove_noise(str(text))
        normalized_text = Preprocessor.normalize(cleaned_text)
        tokens = Preprocessor.tokenize(normalized_text)
        none_stop_words = Preprocessor.remove_stop_words(tokens)
        # stems = [Preprocessor.stem(w) for w in tokens]
        lemmatized = [Preprocessor.lemmatize(w) for w in none_stop_words]
        return ' '.join(lemmatized)
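A hedged usage example for the Preprocessor class above, assuming re, emoji and the hazm imports it references are in scope:

text = "من عاشق کد‌نویسی با پایتون هستم!"
print(Preprocessor.preprocess(text))  # noise removed, normalized, stop words dropped, lemmatized, re-joined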