Example #1
def clean_data():
    correct_base_path = "data/benchmark/fixed/correct/"
    wrong1_base_path = "data/benchmark/fixed/wrong/"
    dist_base_path = "data/benchmark/test/"
    files_name = os.listdir(correct_base_path)
    c = 0
    for file_name in files_name:
        correct_file_lines = open(correct_base_path + file_name, 'r').readlines()
        wrong_file_lines = open(wrong1_base_path + file_name, 'r').readlines()
        if len(correct_file_lines) != len(wrong_file_lines):
            print("this files is not matche: " + file_name)
            continue
        correct_name = dist_base_path + "c_" + file_name
        wrong_name = dist_base_path + "w_" + file_name
        correct_file = open(correct_name, 'w')
        wrong_file = open(wrong_name, 'w')
        for i in range(0, len(correct_file_lines)):
            correct_line = word_tokenize(normalize(correct_file_lines[i]))
            wrong_line = word_tokenize(normalize(wrong_file_lines[i]))
            correct_match, wrong_match = lcs(correct_line, wrong_line)
            correct_str = ""
            for x in correct_match:
                correct_str += x + " "
            wrong_str = ""
            for x in wrong_match:
                wrong_str += x + " "
            correct_str = correct_str.replace("_", "")
            wrong_str = wrong_str.replace("_", "")
            correct_file.write(correct_str + "\n")
            wrong_file.write(wrong_str + "\n")
        correct_file.close()
        wrong_file.close()
Example #2
    def reader(self):
        self.data = []
        words = {}
        for indx, record in enumerate(open(self.path_dataset, 'r')):
            record = json.loads(record)
            record["category"] = record["category"].split("-")[0].strip()
            record = self.pre_processor(record)
            self.data.append(record)
            for word in word_tokenize(record['body']):
                if word in words:
                    words[word] += 1
                else:
                    words[word] = 1
            if indx != 0 and indx % 100 == 0:
                print(indx)

        for indx, record in enumerate(self.data):
            if len(word_tokenize(record['body'])) <= 512:
                continue
            count = {}
            for word in word_tokenize(record['body']):
                count[word] = words[word]
            # rank this record's words by corpus frequency; the original sorted the
            # full `words` dict here, which left `count` unused
            valid_word = [
                item[0] for item in sorted(
                    count.items(), key=lambda kv: kv[1], reverse=True)[:512]
            ]
            record['body'] = " ".join(
                word for indx, word in enumerate(word_tokenize(record['body']))
                if word in valid_word and indx <= 512).strip()
            if indx % 100 == 0:
                print("*", indx)

        # plt.hist(x=words.values(),
        #          bins=40)
        # plt.show()

        return self.data
Example #3
def export_time(question, tokens, labels):
    labels = np.array(labels)
    b_time = np.where(labels == "B_TIM")[0]
    i_time = np.where(labels == "I_TIM")[0]
    url = None
    n = len(b_time)
    if n == 0:
        st_arr = word_tokenize(question)
        t_ = export_time_single(st_arr, question)

        if t_ is None:
            res, url, adhan_names = adhan_handler(
                None, tokens, labels, question)
            if res is not None:
                return res, True, url, adhan_names

        return [t_], False, None, None

    elif n >= 2:
        t_ = []
        time_texts = []
        for i in range(n):
            st_arr = []
            if i < n - 1:
                ida = i_time[np.where(
                    (i_time > b_time[i]) & (i_time < b_time[i + 1]))]
            else:
                ida = i_time[np.where(i_time > b_time[i])]
            for t in np.r_[b_time[i], ida]:
                st_arr.append(tokens[int(t)])
            time_texts.append(" ".join(st_arr))
            t_.append(export_time_single(st_arr, force_return=True))
        is_adhan_needed = False
        new_t = copy(t_)
        for i, t in enumerate(t_):
            if t_[i] is None:
                new_t[i] = time_texts[i]
                is_adhan_needed = True
        if is_adhan_needed:
            res, url, adhan_names = adhan_handler(
                new_t, tokens, labels, question)
            if res is not None:
                if None not in res:
                    return res, True, url, adhan_names
        return t_, False, None, None
    else:
        st_arr = []
        for t in np.r_[b_time, i_time]:
            st_arr.append(tokens[int(t)])
        t_ = export_time_single(st_arr, force_return=True)
        if t_ is None:
            t_ = export_time_single(word_tokenize(question), question)
        if t_ is None:
            res, url, adhan_names = adhan_handler(
                None, tokens, labels, question)
            if res is not None:
                return res, True, url, adhan_names
        return [t_], False, None, None
Example #4
def find_max(folder_path):
    files_name = glob.glob(folder_path + '*.csv')
    max_len = 0
    for file in files_name:
        dataset = pandas.read_csv(file)
        for item in list(dataset.tweet):
            token_count = len(hazm.word_tokenize(item))
            if token_count >= max_len:
                max_len = token_count
    print(max_len)
def load_data(file_name='./dataset/fa_2.xlsx'):
    data, labels = prepare_data(pd.read_excel(file_name))
    unique_labels = np.unique(labels)
    data_new = list([bag_of_words(word_tokenize(d)) for d in data])
    lfeats = dict()
    for label in unique_labels:
        idx = np.where(labels == label)[0]
        data_c = data[idx]
        lfeats[label] = list([bag_of_words(word_tokenize(d)) for d in data_c])
    return lfeats
Example #6
def hazmtoalpheios(word,uri):
    wordslist = etree.Element("words")
    normalizer = Normalizer()
    data = normalizer.normalize(word)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        words.extend(word_tokenize(sentence))
    analyses = []
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    for item in words:
        wordstem = stemmer.stem(item)
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, _ = wordlema.split("#")
        wordtagged = tagger.tag([item])  # POSTagger.tag expects a list of tokens
        wordpofs = wordtagged[0][1]
        wordpofs = maptohazm(wordpofs)
        # a better way to do this would be to create a Python class
        # to formalize the abstraction
        analysis = {}
        analysis['engine'] = 'hazm'
        analysis['uri'] = uri
        analysis['form'] = {}
        analysis['form']['text'] = item
        analysis['form']['lang'] = 'per'
        analysis['entries'] = []
        entry = {}
        entry['dict'] = {}
        entry['dict']['hdwd'] = {}
        entry['dict']['hdwd']['lang'] = 'per'
        entry['dict']['hdwd']['text'] = wordstem
        entry['infls'] = []
        infl = {}
        infl['stem'] = {} 
        infl['stem']['text'] = wordstem
        infl['stem']['lang'] = 'per'
        infl['pofs'] = {}
        if wordpofs:
            infl['pofs']['order'] = str(wordpofs[1])
            infl['pofs']['text'] = wordpofs[0]
        entry['infls'].append(infl)
        analysis['entries'].append(entry)
        analyses.append(analysis)
    return analyses
Example #7
def stem_data(dat):
    normalizer = hazm.Normalizer()
    dat = normalizer.normalize(dat)
    sent = hazm.sent_tokenize(dat)

    words = []

    for s in sent:
        tagged = list(tagger.tag(hazm.word_tokenize(s)))
        new_tag = list(tagged)

        for token in tagged:
            if token[0] in stop_words:
                new_tag.remove(token)

        lemmatizer = hazm.Lemmatizer()
        stemmer = hazm.Stemmer()
        for token in new_tag:
            stemmed = lemmatizer.lemmatize(token[0], pos=token[1])
            stemmed = stemmer.stem(stemmed)
            if len(stemmed) > 0 and ('#' not in stemmed):
                words.append(stemmed)

    return words
Example #8
def countTextWords(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    words = [stemmer.stem(token) for token in tokens]
    return len(words)
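A minimal usage sketch for countTextWords; the sample sentence and the top-level import are illustrative assumptions, not part of the original snippet.

import hazm  # assumed to be imported at module level in the original project

sample_text = "کتاب‌های جدید روی میز گذاشته شدند."
print(countTextWords(sample_text))  # prints the number of stemmed tokens in the sample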
Example #9
 def stremme(val):
     Log.logger.info('Data stemmed with the hazm package')
     # words = [[stemmer.stem(word) for word in word_tokenize(sentence)] for sentence in sent_tokenize(val)]
     words = [[ps.run(word) for word in word_tokenize(sentence)] for sentence in sent_tokenize(val)]
     words = words[0]
     val = ' '.join(words)
     return val
Example #10
def draw_cloud(cleantweets, image_path, show_image=False):
    text = " ".join(str(tweet) for tweet in cleantweets)
    text = get_display(arabic_reshaper.reshape(text))
    tokens = word_tokenize(text)
    dic = Counter(tokens)

    if verbose:
        print(dic.most_common(max_words))

    twitter_mask = np.array(Image.open(f'twitter-logo-q{export_quality}.png'))
    font_path = select_a_font()
    wordcloud = WordCloud(font_path=font_path,
                          max_words=max_words,
                          margin=0,
                          width=5000,
                          height=5000,
                          min_font_size=4,
                          max_font_size=700,
                          background_color="white",
                          mask=twitter_mask)
    wordcloud.generate_from_frequencies(dic)

    image = wordcloud.to_image()
    wordcloud.to_file(image_path)

    if show_image:
        image.show()

    print(f"Generated image {image_path}")
Example #11
def query_process(query):
    text = normalizer.normalize(query)
    text = text.translate(str.maketrans('_|ẖ–;،"…=$&@*-/:<>!+.()«»٪؟', '                           ',
                                        '\u200c\u202c\u200f\u200e\u2069\u2067\u200b\u200d'))
    words = word_tokenize(text)
    words = list(dict.fromkeys(words))
    i = 0
    while i < len(words):
        while True:
            if i >= len(words): break
            repeat = False
            word = words[i]
            lem_word = lemmatizer.lemmatize(word).split('#')[0]
            if lem_word == '':
                lem_word = 'است'
            if word in stopwords or lem_word in stopwords:
                words.remove(word)
                repeat = True
            if not repeat:
                break
        if i >= len(words):
            break

        for t in range(len(samewords)):
            if lem_word in samewords[t]:
                lem_word = samewords[t][0]
                break

        words[i] = lem_word
        i = i + 1
    return words
Example #12
    def process_text(self, text: str) -> Dict[str, int]:
        """
        Splits a long text into words.
        If `persian_normalize` attribute has been set to True, normalizes `text` with Hazm Normalizer.
        If `include_numbers` attribute has been set to False, removes all Persian, English and Arabic numbers from
        text`.
        :param text: The text we want to process
        :return: a dictionary. keys are words and values are the frequencies.
        """
        flags = (
            re.UNICODE if version < '3' and type(text) is unicode  # noqa: F821
            else 0)

        if self.persian_normalize:
            normalizer = Normalizer()
            text = normalizer.normalize(text)
        if not self.include_numbers:
            text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

        if self.regexp:
            words = re.findall(self.regexp, text, flags)
        else:
            words = word_tokenize(text)

        if self.collocations:
            word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
        else:
            word_counts, _ = process_tokens(words, self.normalize_plurals)

        return word_counts
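For readers without the surrounding class, here is a minimal standalone sketch of the same normalize / strip-digits / tokenize / count pipeline; it assumes only hazm and the standard library, and the helper name count_words is hypothetical.

import re
from collections import Counter

from hazm import Normalizer, word_tokenize


def count_words(text, persian_normalize=True, include_numbers=False):
    # Normalize with hazm, optionally drop Persian/Arabic/Latin digits, then count tokens.
    if persian_normalize:
        text = Normalizer().normalize(text)
    if not include_numbers:
        text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)
    return dict(Counter(word_tokenize(text)))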
Example #13
def most_freq_words():
    title, text, ids = extract_data_as_string()

    listToStr = ' '.join(map(str, title + text))
    listToStr = word_tokenize(listToStr)
    word_count = nltk.FreqDist(listToStr)
    return word_count.most_common(30)
Example #14
    def pre_processor(self, record):
        # remove urls
        record['body'] = re.sub(r'^https?:\/\/.*[\r\n]*',
                                '',
                                record['body'],
                                flags=re.MULTILINE)
        record['body'] = re.sub(r'^www?:\/\/.*[\r\n]*',
                                '',
                                record['body'],
                                flags=re.MULTILINE)
        record['body'] = " ".join(
            word for word in record['body'].split()
            if not word.endswith(".ir") and not word.endswith(".com") and
            not word.endswith(".org") and not word.startswith("www.")).strip()

        record['body'] = re.sub(r'\s+\d+\s+', ' ', record['body'])

        # remove stop words
        record['body'] = " ".join(word for word in word_tokenize(
            self.normalizer.normalize(record['body']))
                                  if word not in self.stopwords).strip()

        record['body'] = re.sub('\u00a0', ' ', record['body'])

        # drop parenthesized text and the trailing "انتهای پیام" ("end of message") marker;
        # str.strip() with a string argument strips characters, not a phrase, so use a suffix check
        record['body'] = re.sub(r'\([^)]*\)', '', record['body']).strip()
        if record['body'].endswith("انتهای پیام"):
            record['body'] = record['body'][:-len("انتهای پیام")].strip()

        return record
Example #15
def location_handler(question, tokens, labels, check_validation=True):
    loc = location_(question, tokens, labels)
    if loc:
        if check_validation:
            problem_list = []
            for i, l in enumerate(loc):
                l_inf = get_city_info(l)
                problem = False
                if not l_inf:
                    if l in ["تهرون", "ترون"]:
                        loc[i] = "تهران"
                    elif l in ["گم"]:
                        loc[i] = "قم"
                    elif l in ["اصفان", "اصفون"]:
                        loc[i] = "اصفهان"
                    else:
                        problem = True
                problem_list.append([i, problem])
            w_t = np.array(hazm.word_tokenize(question))
            bloc = np.where(labels == "B_LOC")[0] - 1
            iloc = np.where(labels == "I_LOC")[0] - 1
            if len(bloc) >= len(problem_list):
                for i in range(len(problem_list)):
                    if problem_list[i][1]:
                        if i != len(problem_list) - 1:
                            il = iloc[(iloc > bloc[i]) & (iloc < bloc[i+1])]
                        else:
                            il = iloc[iloc > bloc[i]]
                        loc[problem_list[i][0]] = location_fix(
                            question, [" ".join(w_t[np.r_[bloc[i], il]])])[0]
            else:
                loc = [USER_CITY]
    return loc
Example #16
def tokenize(corpus, lemma=True, punctuation=True, space_to_space=False):

    if not punctuation:
        # table = str.maketrans({key: None for key in string.punctuation})
        # corpus = corpus.translate(table)
        corpus = corpus.replace(',', ' ')
        corpus = corpus.replace("\u220c", "")
        corpus = corpus.replace('(', ' ')
        corpus = corpus.replace(')', ' ')
        corpus = corpus.replace('.', ' ')
        corpus = corpus.replace("،", " ")
        corpus = corpus.replace("«", " ")
        corpus = corpus.replace("»", " ")

    if space_to_space:
        tokenized = corpus.split(' ')
    else:
        tokenized = word_tokenize(corpus)

    if lemma:
        lemmatizer = Lemmatizer()
        for i in range(len(tokenized)):
            tokenized[i] = lemmatizer.lemmatize(tokenized[i]).split('#')[0]

    return tokenized
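A short, hypothetical usage sketch for the tokenize helper above; the imports mirror what the snippet assumes, and the sample sentence is only illustrative.

from hazm import Lemmatizer, word_tokenize  # the snippet above assumes these imports

corpus = "کتاب‌ها را خواندم، و (نکته‌ها) را نوشتم."
print(tokenize(corpus))                                  # lemmatized hazm tokens
print(tokenize(corpus, lemma=False, punctuation=False))  # punctuation stripped, no lemmatization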
Example #17
def test(train_results, classes, test_case=False):
    if test_case:
        file = open("../TestCase/test.txt", 'r', encoding='utf-8')
        output = open("../TestCase/my-output.txt", 'w+', encoding='utf-8')
        output_2 = open("../../ClsModel/NaiveBayes/TestCase.output.txt",
                        'w+',
                        encoding='utf-8')
        classes = ["c1", "c2"]
    else:
        file = open("../test.txt", 'r', encoding='utf-8')
        output = open("../output.txt", 'w+', encoding='utf-8')
        output_2 = open("../../ClsModel/NaiveBayes/Test.output.txt",
                        'w+',
                        encoding='utf-8')
    for line in file.readlines():
        tag_sentence = hazm.word_tokenize(line)
        sentence = tag_sentence[1:]
        for c in classes:
            output.write(c + " ")
            output_2.write(c + " ")
            p = 0
            for word in sentence:
                if word in train_results[c][1]:
                    p += math.log10(train_results[c][1][word])
                else:
                    p += math.log10(train_results[c][1]['<Unk>'])
            p += math.log10(train_results[c][0])
            output.write(str(p) + " ")
            output_2.write(str(p) + " ")
        output.write("\n")
        output_2.write("\n")
Example #18
    def create_user_files(self):
        evaluation_users = pandas.read_csv(self.evaluation_user_path)

        for user_index, user in enumerate(evaluation_users.user):
            user_tweets = []
            user_csv_path = self.crawled_data_path + user + '.csv'
            user_csv = pandas.read_csv(user_csv_path)
            user_csv["length"] = user_csv["reply_to"].apply(
                lambda x: len(ast.literal_eval(x)) == 0)
            user_csv_noreply = user_csv.loc[user_csv["length"], :].drop(
                ["length"], axis=1)

            for i, item in enumerate(user_csv_noreply.tweet):
                item = str(item)
                url = re.findall(r"http\S+", item)
                if not url:
                    item = self.new_normalizer.Normalizer_text(item)
                    if len(hazm.word_tokenize(item)) >= 5:
                        user_tweets.append(item)

                if i % 1000 == 0:
                    print(
                        f'{(i/len(user_csv_noreply))*100 :.2f} done {user} {user_index}'
                    )

            user_csv_noreply_normed = pandas.DataFrame({'tweet': user_tweets})
            user_csv_noreply_normed.to_csv(user_csv_path.replace(
                'crawled_data', 'evaluation_user_data_big'),
                                           index=False)
Example #19
    def score(self, sentences):
        # Predict
        pos, neg, neu = 0, 0, 0
        stemmer = Stemmer()
        classifier = self.__get_model()
        normalizer = Normalizer()

        sentences = sent_tokenize(sentences)

        total_words = 0
        for sentence in sentences:
            sentence = normalizer.normalize(sentence)
            words = word_tokenize(sentence)
            total_words += len(words)

            for word in words:
                word = stemmer.stem(word)  # stem before classifying (the original discarded the stemmed form)
                class_result = classifier.classify(self.__word_feats(word))
                if class_result == 'neg':
                    neg = neg + 1
                if class_result == 'pos':
                    pos = pos + 1
                if class_result == 'neu':
                    neu = neu + 1

        # use the total word count across all sentences, not just the last sentence
        positive_sentiment = float(pos) / total_words
        # print('Positive: ' + str(positive_sentiment))
        neutral_sentiment = float(neu) / total_words
        # print('Neutral: ' + str(neutral_sentiment))
        negative_sentiment = -float(neg) / total_words
        # print('Negative: ' + str(negative_sentiment))

        total_sentiment = (positive_sentiment + negative_sentiment) / 2
        # print('Total (Avg): ' + str(total_sentiment))

        return total_sentiment
Example #20
def compute_test_perplexity(n, data_path, tokenized):
    with open(data_path, "r", encoding="utf-8") as test:
        test = test.read()

    test_tokenized = word_tokenize(test)

    ngrams = generate_n_gram(tokenized, n)
    ngrams_minus_1 = generate_n_gram(tokenized, n - 1)

    sum_log_probs = 0
    for i in range(n, len(test_tokenized)):

        prob = 1
        found = False
        for ngram in ngrams:
            if ngram[0] == test_tokenized[i - n:i]:
                count_ngram = ngram[1]
                found = True

        if found:
            for ngram_1 in ngrams_minus_1:
                if ngram_1[0] == test_tokenized[i - n:i - 1]:
                    prob = count_ngram / ngram_1[1]

        # unseen n-grams contribute probability 1 (log 0), as in the original
        sum_log_probs = sum_log_probs + math.log(prob, 2)

    perplexity_by_log = 2**(-1.0 * sum_log_probs / len(test_tokenized))

    # print(perplexity_by_log)

    return perplexity_by_log
Example #21
def home():

    if request.method == 'POST':
        inputText = request.form['text']
        nltk_stopwords = get('stopwords')
        # stemmer = Stemmer()
        title_body_tokenized = word_tokenize(inputText)
        title_body_tokenized_filtered = [
            w for w in title_body_tokenized if w not in nltk_stopwords
        ]
        # title_body_tokenized_filtered_stemming =  [stemmer.stem(w) for w in title_body_tokenized_filtered]
        # print(title_body_tokenized_filtered_stemming)

        vectorizer = get('vectorizer')
        title_body_tokenized_filtered_stemming_vectorized = vectorizer.transform(
            title_body_tokenized_filtered)
        model = get('model')
        predict = model.predict(
            title_body_tokenized_filtered_stemming_vectorized)

        lables = get('lables')
        lable = lables[predict]
        return render_template('index.html',
                               lable=lable,
                               stemer=title_body_tokenized_filtered)  # stemming step is commented out above
    else:
        return render_template('index.html')
Example #22
    def extract_metadata(self, tweet):
        important_words = []
        syms = []
        hashtags = []
        content_len = 0

        content = self.normalizer.normalize(tweet['content'])
        if 'های وب' in content: syms.append('های_وب')
        sentences = sent_tokenize(content)
        for sentence in sentences:
            sentence = sentence.translate(str.maketrans('', '', self.punctuations))

            words = word_tokenize(sentence)
            content_len += len(words)
            sent_syms, sent_hashs = self.get_symbols(words)
            syms += sent_syms
            hashtags += sent_hashs
            tags = self.tagger.tag(words)
            verbs = [word for (word, role) in tags if role == 'V']

            filtered_words = ([word.replace('#', '')
                               for word in words if word.replace('#', '') not in self.stop_words
                               and word.replace('#', '') not in verbs
                               and set(word.replace('#', '')).intersection(self.persian_alphabet)
                               and len(word.replace('#', '')) > 1])
            important_words += filtered_words
        syms = list(set(syms))
        hashtags = list(set(hashtags))
        bigrams = self.get_ngrams(important_words, 2)
        trigrams = self.get_ngrams(important_words, 3)
        candidate_words = hashtags + syms + important_words + bigrams + trigrams
        keywords = self.get_keywords(candidate_words, content_len)
        return keywords, syms, hashtags
Example #23
    def __call__(self, text):
        # preprocessing
        text = str(text)  # the original used Python 2's unicode()
        text = normalize_numbers(text)
        # text = ''.join(char for char in unicodedata.normalize('NFD', text)
        #                if unicodedata.category(char) != 'Mn')  # Strip accents
        # text = re.sub("[^ a-z'.,?!\-]", "", text)

        normalizer = hazm.Normalizer()
        text = normalizer.normalize(text)
        # tokenization
        words = hazm.word_tokenize(text)
        # tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        prons = []
        for word in words:
            if not any(letter in word for letter in self.graphemes):
                pron = [word]

            # elif word in self.homograph2features:  # Check homograph
            #     pron1, pron2, pos1 = self.homograph2features[word]
            #     if pos.startswith(pos1):
            #         pron = pron1
            #     else:
            #         pron = pron2
            elif word in self.tihu:  # lookup tihu dict
                pron = self.tihu[word]
            else: # predict for oov
                pron = self.predict(word)

            prons.extend(pron)
            prons.extend([" "])

        return prons[:-1]
Example #24
def prepareText(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    words = [stemmer.stem(token) for token in tokens]
    return words
Example #25
def data_aug(dataset, w2c_model):
    train_aug_text = []
    train_aug_label = []

    start_time = time.time()
    for i, item in enumerate(dataset.text):
        item = str(item)
        item_label = dataset.label[i]
        train_aug_text.append(item)
        train_aug_label.append(item_label)

        item_tokenized = hazm.word_tokenize(item)
        for num_aug in range(2):
            result = data_w2v_aug(num_aug, item_tokenized, w2c_model)
            train_aug_text.append(result)
            train_aug_label.append(item_label)

        if i % 1000 == 0:
            print(f'{i} processed, {time.time() - start_time:.1f}s elapsed')
            start_time = time.time()

    auged_dataframe = pandas.DataFrame({
        'text': train_aug_text,
        'label': train_aug_label
    })
    return auged_dataframe
Example #26
 def preProcessing(self, doc, level=0):
     """
     This function remove punctuations and some useless prepositions and return a list of words.
     """
     junkList = [
         ".", "-", "]", "[", "،", "؛", ":", ")", "(", "!", "؟", "»", "«",
         "ْ"
     ]
     junkWords = [
         "که", "از", "با", "برای", "با", "به", "را", "هم", "و", "در", "تا",
         "یا", "هر", "می", "بر"
     ]
     pronouns = [
         "من", "تو", "او", "ما", "شما", "ایشان", "آن‌ها", "این‌ها", "آن",
         "این", "اونجا", "آنجا", "انجا", "اینها", "آنها", "اینکه"
     ]
     for char in junkList:
         doc = doc.replace(char, " ")
     result = []
     doc = hazm.Normalizer().normalize(doc)
     doc = hazm.word_tokenize(doc)
     for word in doc:
         word = word.strip()
         if word not in junkWords and word not in pronouns:
             result.append(word)
     return result
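The docstring above describes a plain punctuation and stop-word filter; a standalone sketch of the same idea could look like the following, where the function name clean_tokens and the shortened junk lists are illustrative assumptions.

import hazm


def clean_tokens(doc, junk_chars=".-][،؛:)(!؟»«", junk_words=("که", "از", "با", "به", "را")):
    # Replace punctuation with spaces, normalize, tokenize, and drop the listed words.
    for ch in junk_chars:
        doc = doc.replace(ch, " ")
    doc = hazm.Normalizer().normalize(doc)
    return [w.strip() for w in hazm.word_tokenize(doc) if w not in junk_words]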
Example #27
    def remove_stop_words(val):
        Log.logger.info('Stop words removed')
        stops = Constant.STOP_WORDS
        words = [[word for word in word_tokenize(sentence) if word not in stops] for sentence in sent_tokenize(val)]
        words = words[0]

        val = ' '.join(words)
        return val
Example #28
def get_names(text):
    tagged_words = tagger.tag(word_tokenize(text))
    words = set(filter(
        is_word_ok,
        [tagged_word[0] for tagged_word in tagged_words if tagged_word[1] == 'N']
    ))

    return words.union(get_hash_tags(text))
Example #29
def tokenize_text(text):
    text = text.replace('.', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.replace('\u200c', ' ').replace('\n',
                                               '').replace('\r', '').replace(
                                                   'ي', 'ی').replace('ك', 'ک')
    normalized_text = normalizer.normalize(text)
    tokens = word_tokenize(normalized_text)
    return tokens
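Hypothetical usage for tokenize_text; the snippet assumes a module-level normalizer, so the sketch below binds one explicitly, and the sample sentence is illustrative.

import re

from hazm import Normalizer, word_tokenize

normalizer = Normalizer()  # the function above expects such a module-level object
print(tokenize_text("اين متن آزمايشي است. كتاب‌ها را خواندم"))  # Arabic ي/ك are mapped to Persian ی/ک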
Example #30
 def tokenize(self, text):
     text = self.remove_symbols(text)
     text = re.sub(r'\s+', ' ', text).strip()
     text = text.lower()
     text = text.replace('\u200c',
                         ' ').replace('\n', '').replace('\r', '').replace(
                             'ي', 'ی').replace('ك', 'ک')
     normalized_text = normalizer.normalize(text)
     return word_tokenize(normalized_text)
def statement_pre_processing(input_statement):
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    input_statement = normalizer.normalize(input_statement)
    input_statement = [
        lemmatizer.lemmatize(word) for word in word_tokenize(input_statement)
        if word not in stops
    ]
    return input_statement
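Hypothetical usage for statement_pre_processing; the tiny stops set below is an illustrative stand-in for whatever stop-word list the original module defines.

from hazm import Lemmatizer, Normalizer, word_tokenize

stops = {"را", "و", "به"}  # stand-in for the module's real stop-word list
print(statement_pre_processing("کتاب‌ها را خواندم و به خانه رفتم"))  # lemmatized tokens without stop words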
Example #32
    def bigram_cleaner(text):
        text = re.sub(Text_cleaner.persian_regex, ' ', text)
        text = re.sub('[ ]+', ' ', text)

        normalizer = Normalizer()
        text = normalizer.normalize(text)

        tokenized = word_tokenize(text)
        return tokenized
	def topics(self, model, document, dictionary=None):
		if dictionary is not None:
			self.dictionary = dictionary
		text = [w for w in word_tokenize(document) if w not in self.stopwords and len(w) > 1]
		corpus = self.dictionary.doc2bow(text)
		print('Which LDA topic maximally describes a document?\n')
		print('Original document: ' + document)
		print('Topic probability mixture: ' + str(model[corpus]))
		print('Maximally probable topic: topic #' + str(max(model[corpus], key=itemgetter(1))[0]))
		return model[corpus]
Example #34
def hazmtoalpheiosfile(data,uri):
    root = etree.Element("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF")    
    oaannotation = etree.SubElement(root,'{http://www.w3.org/ns/oa#}Annotation',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':'http://services.projectbamboo.org/morphology'+uri})
    oahasbody = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasBody',)
    oahastarget = etree.SubElement(oaannotation,'{http://www.w3.org/ns/oa#}hasTarget')
    hasbodydesc = etree.SubElement(oahastarget,'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':uri})
    ispartof = etree.SubElement(hasbodydesc,'{http://purl.org/dc/terms/}isPartOf',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':uri})
    source = etree.SubElement(hasbodydesc,'{http://purl.org/dc/terms/}source',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource':uri})
    title = etree.SubElement(oaannotation, '{http://purl.org/dc/elements/1.1/}title', {'{http://www.w3.org/XML/1998/namespace}lang':'eng'})
    title.text = "Morphology of " + uri
    wordslist = etree.SubElement("words")
    normalizer = Normalizer()
    data = normalizer.normalize(data)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        words.extend(word_tokenize(sentence))
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    for item in words:
        wordstem = stemmer.stem(item)
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, _ = wordlema.split("#")
        wordtagged = tagger.tag([item])  # POSTagger.tag expects a list of tokens
        wordpofs = wordtagged[0][1]
        word = etree.SubElement(wordslist,'word')
        form = etree.SubElement(word, 'form', {'{http://www.w3.org/XML/1998/namespace}lang':'per'})
        form.text = item
        entry = etree.SubElement(word, 'entry')
        infl = etree.SubElement(entry, 'infl')
        term = etree.SubElement(infl, 'term', {'{http://www.w3.org/XML/1998/namespace}lang':'per'})
        stem = etree.SubElement(term, 'stem')
        stem.text = wordstem
        pofs = etree.SubElement(infl, 'pofs')
        pofs.text = wordpofs
    return root
Example #35
def document(filepath):
    f = open(filepath, 'r', encoding='utf-8', errors='ignore')
    txt = f.read()
    f.close()

    txt = remove_punctuation(txt)
    
    normalizer = Normalizer()
    txt = normalizer.normalize(txt)
    
    document = word_tokenize(txt)
    
    document = [word for word in document if word not in stop_words and not word.isdigit()]
    
    return document
	def texts(self, categories={'Politics'}, limit=None):
		docs = self.hamshahri.docs()
		print('start reading corpus...')
		count = 0
		texts = []
		for doc in docs:
			if limit is not None and count == limit:
				break
			if len(categories.intersection(set(doc["categories_en"]))) > 0:
				count += 1
				for sent in sent_tokenize(doc['text']):
					if len(sent) <= 1:
						continue
					texts.append([word for word in word_tokenize(sent) if word not in self.stopwords and len(word) > 1])
		return texts
def construct_language_model_from_tabnak_collection(dictionary):
    file = open("data/tabnakNewsCollection.json", 'r')
    normalizer = SCNormalizer()
    language_model = LanguageModel(dictionary)
    i = 0
    for line in file:
        try:
            data = json.loads(line)
            content = data['title'] + " " + data['content']
            normalized_content = normalizer.normalize(content)
            word_tokenized = word_tokenize(normalized_content)
            for word in word_tokenized:
                word = word.replace("_", PersianTools().HalfSpace)
                language_model.add(word)
            i += 1
            if i % 1000 == 0:
                print(i)
        except Exception:
            print("error occurred reading json file")
    language_model.export_to_file()
    return language_model
Example #38
def StringError_avg(c):
    temp_1 = []
    temp_2 = []  # distance matrix holding the average distances between clusters

    for i in range(len(c)):
        temp_1.append(word_tokenize(c[i]))
        temp_2.append([])

    m = 0
    while m < len(temp_1):  # build the distance matrix temp_2
        for i in range(len(temp_1)):
            count = 0
            for word_1 in temp_1[m]:
                for word_2 in temp_1[i]:
                    count = count + StringError(word_1, word_2)
            count = count / (len(temp_1[m]) * len(temp_1[i]))
            temp_2[m].append(count)
        m += 1

    print(temp_2)
Example #39
from __future__ import unicode_literals
import os,sys,codecs
from hazm import Normalizer,sent_tokenize, word_tokenize


reader=codecs.open(os.path.abspath(sys.argv[1]),'r',encoding='utf-8')
writer=codecs.open(os.path.abspath(sys.argv[2]),'w',encoding='utf-8')

count=1
line=reader.readline()

normalizer = Normalizer()

while line:
	if count%1000==0:
		sys.stdout.write(str(count)+'...')

	if line.strip():
		n=normalizer.normalize(line.strip())
		tok=word_tokenize(n)
		sen=u' '.join(tok).replace('_',' ').replace('  ',' ').replace('  ',' ')
		l=sen+u'\n'
		writer.write(l)
	else:
		writer.write(u'\n')

	count+=1
	line=reader.readline()
sys.stdout.write('\n')
writer.flush()
writer.close()
def k_means(doc,number_of_clusters,numbers_of_iterations):
    
    literals=["به","با","از","در","بی","برای","چون","اندر","زیر","بر","الی","جز","الا","مگر","نزد","نزدیک","پیش","روی","میان","پی","جلوی","مانند","چون","درون","فراز","درباره ","محص","خاطر","نظر","راه","مثل","توسط","خلاف","دنبال","زعم","سبب","خلال","راه","سر","عین","وقت","هنگام","بجز","همچون","همچون","زیبا","قشنگ","روشن","مشخص","بزرگ","فوقالعاده","خوب","ناراحت","کوچک","مهربان","محبوب","معتقد","خوشگل","ممنون","سبک","موقت","احمق","شلوغ","مهم","جدید","بد","دور","کامل","موافق","مقارن","اجتماعی","معین","صادق","مسخره","غمگین","سرغ","خوشحال","مناسب","کند","زشت","پارسا","قدیمی","سخت","خوش","غریبه"]
    
    doc_list=doc_normalizer(doc)
    
    l_1=[]
    
    for i in range(len(doc_list)):
        l_1.append(word_tokenize(doc_list[i]))
    
    for i in range(len(l_1)):
        # `del word` in the original only unbound the loop variable; filter the list instead
        l_1[i] = [word for word in l_1[i] if word not in literals]
    l_2=doc_stemmer(l_1)
    
    l_vectors=[]
    for i in range(len(doc_list)):
        l_vectors.append([])
    
    for i in range(len(l_2)):#converting document to equivalent vector model
        for j in range(len(l_2[i])):
            l_vectors[i].append(w(l_2,l_2[i],l_2[i][j]))
    
    
    l_centeroids=[]
    for i in range(number_of_clusters):#generating random initial centeroids
        l_centeroids.append([])
    for i in range(len(l_centeroids)):
        for j in range(len(doc_list)):
            l_centeroids[i].append(random.random())    
        
    l_cosines=[]
    for i in range(len(l_vectors)):
        l_cosines.append([])
    for i in range(len(l_vectors)):
        for j in range(len(l_centeroids)):
            l_cosines[i].append(cosine(l_vectors[i], l_centeroids[j]))
            
    #print l_cosines  
    
    l_index=[]
    for i in range(len(l_cosines)):
        l_index.append([])
    
    for i in range(len(l_cosines)):
        for j in range(len(l_cosines[i])):
            if l_cosines[i][j]==min(l_cosines[i]):
                l_index[i].append(j)
                
    #print l_index                      
            
    l_clusters_1=[]
    l_clusters_2=[]
    l_clusters_vectors=[]
    for i in range(number_of_clusters):
        l_clusters_1.append([])
        l_clusters_2.append([])
        l_clusters_vectors.append([])        
    for i in range(len(l_index)):
        for j in range(len(l_index)):
            if l_index[i][0]==l_index[j][0] and j not in l_clusters_1[l_index[i][0]]:
                l_clusters_1[l_index[i][0]].append(j)
                l_clusters_2[l_index[i][0]].append(l_2[j])
                l_clusters_vectors[l_index[i][0]].append(l_vectors[j])
                
    #print l_clusters_1
    #print l_clusters_2
    #print l_clusters_vectors
    #print centeroid_generator(l_clusters_vectors) 
    
    iteration=0
    while iteration<numbers_of_iterations:
        l_centeroids_2=centeroid_generator(l_clusters_vectors)
        
        
        l_cosines_2=[]
        for i in range(len(l_vectors)):
            l_cosines_2.append([])
        for i in range(len(l_vectors)):
            for j in range(len(l_centeroids_2)):
                if l_centeroids_2[j]:
                    l_cosines_2[i].append(cosine(l_vectors[i], l_centeroids_2[j]))
        
        #print l_cosines_2           
        l_index_2=[]
        for i in range(len(l_cosines_2)):
            l_index_2.append([])
    
        for i in range(len(l_cosines_2)):
            for j in range(len(l_cosines_2[i])):
                if l_cosines_2[i][j]==min(l_cosines_2[i]):
                    l_index_2[i].append(j)
        
        
        l_clusters_1=[]
        l_clusters_2=[]
        l_clusters_vectors=[]
        for i in range(number_of_clusters):
            l_clusters_1.append([])
            l_clusters_2.append([])
            l_clusters_vectors.append([])        
        for i in range(len(l_index_2)):
            for j in range(len(l_index_2)):
                if l_index_2[i][0]==l_index_2[j][0] and j not in l_clusters_1[l_index_2[i][0]]:
                    l_clusters_1[l_index_2[i][0]].append(j)
                    l_clusters_2[l_index_2[i][0]].append(l_2[j])
                    l_clusters_vectors[l_index_2[i][0]].append(l_vectors[j]) 
                             
        iteration+=1            
        
    print(l_clusters_vectors)
    print(l_clusters_1)
    print(l_clusters_2)
Example #41
    tmp = line.split('\n')
    mylabel.append(int(tmp[0]))
file_to_read.close()

file_to_read = open(sentence_path, 'r')
file_content = file_to_read.readlines()
file_to_read.close()

index  = 0
for line in file_content:
    tmp = line.split('\n')
    tmp = tmp[0]
    tmp = normalizer.normalize(tmp)
    #print(tmp)
    #print(sent_tokenize(tmp))
    word_tokenized = word_tokenize(tmp)
    #print(word_tokenized)
    labeledSent = TaggedDocument(words = word_tokenized, tags = [index])
    sentences.append(labeledSent)
    index += 1

num_features = 100
min_word_count = 5
context = 8
num_workers = 4
print("Training model...")
model = Doc2Vec(sentences, workers=num_workers, size = num_features, min_count = min_word_count, window = context)
print("model Trained.")

for epoch in range(num_Of_epoch):
    model.train(sentences)
def sentences(file="simple_text"):
    normalizer = Normalizer()
    for line in open(file, "r", encoding="utf-8").readlines():
        for sent in sent_tokenize(line):
            yield word_tokenize(sent)
Example #43
0

hamshahri = HamshahriReader()
normalizer = Normalizer()
tagger = POSTagger()
parser = DependencyParser(tagger=tagger)
extractor = InformationExtractor()
texts = []

output = open('informations.txt', 'w')
for text in Bar(max=310000).iter(hamshahri.texts()):
	texts.append(normalizer.normalize(text))
	if len(texts) <= 1000: continue

	sentences = []
	for text in texts:
		for sentence in sent_tokenize(text):
			words = word_tokenize(sentence)
			if len(words) >= 3:
				sentences.append(words)
	texts = []

	tagged = tagger.batch_tag(sentences)
	parsed = parser.tagged_batch_parse(tagged)

	for sentence in parsed:
		# print('*', *[node['word'] for node in sentence.nodelist if node['word']], file=output)
		for information in extractor.extract(sentence):
			print(*information, sep=' - ', file=output)
		print(file=output)
  #      row_sums[row_sums.nonzero()]).sum() / len(row_sums[row_sums.nonzero()])

    #print labels
    #print confusion_matrix
    return precision


if __name__ == '__main__':
    rd = HamshahriReader(config.corpora_root)
    counter = Counter()
    docs = []
    normalizer = Normalizer()
    stemmer = Stemmer()
    for doc in rd.docs(count=config.documents_count):
        doc['text'] = normalizer.normalize(doc['text'])
        doc['words'] = [stemmer.stem(word) for word in word_tokenize(doc['text'])]
        counter.update([doc['cat']])
        docs.append(doc)

    print(counter)
    all_words = []
    for doc in docs:
        all_words.extend(doc['words'])

    dist = nltk.FreqDist(word for word in all_words)

    word_features = dimension_reduction(all_words, dist)
    print(len(word_features) / float(len(all_words)) * 100.0)

    features_set = [(doc_features(doc, word_features), doc['cat']) for doc in docs]
    #train_set, test_set = features_set[:len(docs)/2], features_set[len(docs)/2:len(docs)]