class Text:
    def __init__(self):
        self.stops = self.stopsget()
        self.mystem = Mystem(mystem_bin=None,
                             grammar_info=False,
                             disambiguation=False)
        self.stops_to_nil = [
            re.compile(st) for st in ['[0-9]+', '[.!?"\-,:—%*();»«]+']
        ]

    def stopsget(self):
        with open('finstops.txt') as f:
            stops = [
                re.compile(u'(\s|^){}(\s)'.format(line.strip()))
                for line in f.readlines()
            ]
        return stops

    def normalize(self, text):
        for stop_nil in self.stops_to_nil:
            text = re.sub(stop_nil, '', text)
        for stop in self.stops:
            text = re.sub(stop, '\\1\\2', text.lower())
        text = re.sub('  +', ' ', text)
        text = re.sub('\n ', '\n', text)
        tr = []
        for word in text.split():
            lemm = self.mystem.lemmatize(word)[0]
            tr.append(lemm)
        text = u' '.join(tr)
        return text

    def lemmat(self, line):
        res = [self.mystem.lemmatize(word.lower())[0] for word in line.split()]
        return res
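A minimal usage sketch for the Text helper above (the sample sentence, the contents of finstops.txt, and running the module as a script are assumptions):

# Hypothetical usage; assumes `import re`, `from pymystem3 import Mystem`,
# and a finstops.txt file with one stop word per line in the working directory.
if __name__ == '__main__':
    t = Text()
    print(t.normalize('Коты 100 раз спали на диване!'))  # digits/punctuation stripped, words lemmatized
    print(t.lemmat('Коты спали'))                        # expected roughly: ['кот', 'спать']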
Example #2
    def pars(self, result, lemmatize=True):
        '''
            Parse the extraction result into a dict whose keys are entity tags
            and whose values hold a list of words for every detected entity
            lemmatize -- flag to lemmatize the text with Mystem by Yandex
            input: extraction result as (word, tag) pairs with BIO tags
            output: dictionary {tag: [[words], [words]]}
        '''

        if lemmatize:
            m = Mystem()
        d = {}
        s = []
        for word, tag in result:
            if tag == 'O':
                if len(s) != 0:
                    if key_tag in d:
                        d[key_tag].append(s)
                    else:
                        d[key_tag] = [s]
                    s = []
                else:
                    continue
            elif tag[0] == 'B':
                key_tag = tag[2:]
                s = []
                if lemmatize and key_tag != 'ORG':
                    word = m.lemmatize(word)[0]
                s.append(word)
            elif tag[0] == 'I':
                if lemmatize and key_tag != 'ORG':
                    word = m.lemmatize(word)[0]
                s.append(word)
        # flush a trailing entity if the sequence does not end with an 'O' tag
        if len(s) != 0:
            if key_tag in d:
                d[key_tag].append(s)
            else:
                d[key_tag] = [s]
        return d
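For illustration, a hedged sketch of the input and output shapes that pars works with (the tag set and the words are assumptions):

# Hypothetical BIO-tagged extractor output, as (word, tag) pairs:
result = [('Московский', 'B-ORG'), ('университет', 'I-ORG'),
          ('открывает', 'O'), ('новый', 'B-MISC'), ('курс', 'I-MISC'), ('.', 'O')]
# pars(result) groups consecutive B-/I- tokens into entities keyed by tag,
# lemmatizing everything except ORG entities, giving roughly:
# {'ORG': [['Московский', 'университет']], 'MISC': [['новый', 'курс']]}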
Example #3
def mystem_using_with_considering_of_multiple_letters(input_directory, output_directory):
        input_files = filter(lambda x: not x.endswith('~'), os.listdir(input_directory))
        output_data = {}
        m = Mystem()
        # iterate over the documents
        for input_file in input_files:
            with open(input_directory + '/' + input_file) as data_file:
                data = json.load(data_file)
            list_of_terms = filter(lambda x: x != '', re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', data['text']))
            my_list_of_terms = []
            for term in list_of_terms:
                if term == m.lemmatize(term)[0]:
                    my_term = term
                    term = u''
                    prev_letter = my_term[0]
                    term += my_term[0]
                    for i in range(1, len(my_term)):
                        if my_term[i] != prev_letter:
                            term += my_term[i]
                        prev_letter = my_term[i]
                    my_list_of_terms.append(term)
                else:
                    my_list_of_terms.append(term)
            list_of_terms = my_list_of_terms
            text = ' '.join(['%s' % term for term in list_of_terms])
            list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), m.lemmatize(text))
            text_of_output = ' '.join(['%s' % term for term in list_of_terms])
            output_data[input_file] = {}
            output_data[input_file]['id'] = data['id']
            output_data[input_file]['positive'] = data['positive']
            output_data[input_file]['sarcasm'] = data['sarcasm']
            output_data[input_file]['text'] = text_of_output
            with open(output_directory + '/' + input_file, 'w') as output_file:
                json.dump(output_data[input_file], output_file)
Example #4
        def lambdaFunc(self, node, A, B):
            # A and B are lists of words; node is a dict with "name" and "parent" keys
            m = Mystem()
            if (node["parent"] == "null"):
                return False
            nodeName = ''.join(m.lemmatize(str(node["name"])))
            parentName = ''.join(m.lemmatize(str(node["parent"])))
            lemA = list(map(lambda x: m.lemmatize(str(x))[0], A))
            lemB = list(map(lambda x: m.lemmatize(str(x))[0], B))
            AInNodeName = True
            AInParentName = True
            BInNodeName = True
            BInParentName = True
            for lem in lemA:
                if lem.upper() not in nodeName.upper():
                    AInNodeName = False

            for lem in lemA:
                if lem.upper() not in parentName.upper():
                    AInParentName = False

            for lem in lemB:
                if lem.upper() not in nodeName.upper():
                    BInNodeName = False

            for lem in lemB:
                if lem.upper() not in parentName.upper():
                    BInParentName = False

            return (AInNodeName and BInParentName) or (BInNodeName and AInParentName)
def preprocess_text(text):
    mystem = Mystem() 
    rs = ''
    for x in stopwords.words('russian'):
        rs += x + " "
    rs = mystem.lemmatize(rs)
    russian_stopwords = list(set(stopwords.words("russian") + rs + [
        "который", "это", "сказать", "/ТАСС/", "тыс", "млн", "млрд",
        "президент", "весь", "год", "“", "”, - ", " “", "сообщать", ') - ',
        "”", "мочь", "также", "” ", "время", ""
    ]))
    mystem = Mystem()
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens
              if (token not in russian_stopwords)
              and (token != " ")
              and (token.strip() not in punctuation)]
    
    return tokens
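A usage sketch for preprocess_text above (assumes the NLTK Russian stop-word corpus is downloaded and `punctuation` is imported from the string module):

# Hypothetical call; the input sentence is illustrative.
tokens = preprocess_text('Кот спал на диване весь день')
print(tokens)  # expected roughly: ['кот', 'спать', 'диван', 'день']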
Example #6
    def tokeniz(self, df):

        #########COMMENTS#####################
        for i in range(len(df)):
            df["comment"][i] = list(df["comment"][i][2:-2].replace(
                "'", '').split(','))
        tw = TweetTokenizer()
        det = TreebankWordDetokenizer()
        for i in (range(len(df))):
            for j in range(len(df["comment"][i])):
                tokenized_example = (tw.tokenize(df["comment"][i][j]))
                filtered_example = [
                    word for word in tokenized_example
                    if not word in self.sum_noise
                ]
                df["comment"][i][j] = det.detokenize(filtered_example)
        mystem_analyzer = Mystem(entire_input=False)
        for i in (range(len(df))):
            df["comment"][i] = [
                mystem_analyzer.lemmatize(w) for w in df["comment"][i]
            ]
            df["comment"][i] = list(filter(None, df["comment"][i]))
        for i in range(len(df)):
            for j in range(len(df['comment'][i])):
                df['comment'][i][j] = [
                    word for word in df['comment'][i][j]
                    if not word in self.sum_noise
                ]

        ##########POSTS##############
        for i in (range(len(df))):
            tokenized_example = (tw.tokenize(df["post"][i]))
            filtered_example = [
                word for word in tokenized_example
                if not word in self.sum_noise
            ]
            df["post"][i] = det.detokenize(filtered_example)
        for i in (range(len(df))):
            a = []
            a.append(df['post'][i])
            df["post"][i] = a
        for i in (range(len(df))):
            df["post"][i] = [
                mystem_analyzer.lemmatize(w) for w in df["post"][i]
            ][0]
        for i in range(len(df)):
            df['post'][i] = [
                word for word in df['post'][i] if not word in self.sum_noise
            ]

        return df
Example #7
def _get_russian(soup, word=None):
     russian = None
     if(soup.find(class_='t_inline_en')):
         if(word is not None):
             yandex_url = "https://translate.yandex.net/api/v1.5/tr.json/translate?lang=en-ru&format=plain&key=trnsl.1.1.20181026T095610Z.0f9e5b3c50d78498.83dff75a74e7d95e0712640c87b207295ef8842a&text=" + word.replace(' ','%20')
             yandex_url_to = "https://translate.yandex.net/api/v1.5/tr.json/translate?lang=en-ru&format=plain&key=trnsl.1.1.20181026T095610Z.0f9e5b3c50d78498.83dff75a74e7d95e0712640c87b207295ef8842a&text=" +'to%20' + word.replace(' ','%20')
             yandex_translate = urllib.request.urlopen(yandex_url).read()
             yandex_translate_to = urllib.request.urlopen(yandex_url_to).read()
             yd = json.loads(yandex_translate.decode("utf-8"))['text'][0].replace('чтобы','',1).strip().replace('себе','',1).strip()
             yd_to = json.loads(yandex_translate_to.decode("utf-8"))['text'][0]
             russian = soup.find(class_='t_inline_en').text.replace('\u2002',' ').replace('  ',' ').strip()
             mystem = Mystem()
             lemmas = mystem.lemmatize(yd)                    
             ws = russian.split(',')
             b = False
             for idx in range(1, 3):
                 for w in ws:
                     if((not w.find(yd[:-idx]) == -1 or not w.find(lemmas[0][:-idx]) == -1) and b == False):
                         russian = russian.replace(w,'<b>' + w.upper() + '</b>', 1)
                         b = True                             
             if(b == False):
                 wsl = ''
                 for idw, w in enumerate(ws):
                     wsl += str(idw) + ' — ' + w + ', '                  
                 ii = input('Выберети основной перевод слова «' + word + ' (to ' + word + ')» — «'+ yd + ' ('+ yd_to + ')»:\n' + wsl[:-2] + ': ')
                 russian = russian.replace(ws[int(ii)],'<b>' + ws[int(ii)].upper() + '</b>', 1)
                 print(russian)
         else:
             russian = soup.find(class_='t_inline_en').text.replace('\u2002',' ').replace('  ',' ').strip()
     elif(soup.find(class_='light_tr')):
         russian = soup.find(class_='light_tr').text.replace('\u2002',' ').replace('  ',' ').strip()
     return russian
Example #8
def tokenize_sentences_lemmatized(rawSentences):
    print('LEMMATIZED total = ' + str(len(rawSentences)))
    sentences = []
    m = Mystem()
    index = 0
    for c in rawSentences:
        tokenized_sents = m.lemmatize(c)
        cleaned_set = []
        for tokenized in tokenized_sents:
            if tokenized == "":
                break
            tokenized = tokenized.lower()
            if tokenized in stopwords.words('russian'):
                continue

            # keep tokens whose first character is a Cyrillic or Latin lowercase letter
            token = tokenized[0]
            if ('а' <= token <= 'я') or ('a' <= token <= 'z'):
                cleaned_set.append(tokenized)

        if len(cleaned_set) > 0:
            sentences.append(cleaned_set)
        if index % 100 == 0:
            print(index)
        index += 1
    return sentences
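A usage sketch for tokenize_sentences_lemmatized above (the sentence is illustrative; the NLTK Russian stop-word corpus must be downloaded):

sents = ['Коты спали на диване.']
print(tokenize_sentences_lemmatized(sents))  # expected roughly: [['кот', 'спать', 'диван']]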
Example #9
class Tokenizer:
    def __init__(self):
        self.space_pattern = re.compile(r'[^.А-ЯA-ZЁ]+', re.I)

        self.m = Mystem()

        try:
            with open('nw_model/stopwords.txt') as f:
                self.stop_words = set(f.read().split('\n')) | {''}
        except FileNotFoundError:
            self.stop_words = set()
            print(
                f'{Fore.RED}WARNING!!! Stop-words file not found!{Style.RESET_ALL}'
            )

    def tokenize_line(self, line):
        """
        Токенизирует одну строку
        :param line:
        :return: набор лексем (pymysteam)
        """
        try:
            return [
                word for word in self.m.lemmatize(
                    self.space_pattern.sub(' ', line.lower()))
                if word.strip() not in self.stop_words
            ]
        except BrokenPipeError:
            self.m = Mystem()
            return self.tokenize_line(line)

    def join(self, lst):
        return self.space_pattern.sub(' ', ' '.join(lst))
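A usage sketch for the Tokenizer above (assumes the imports the class relies on -- re, pymystem3, colorama -- are in place; nw_model/stopwords.txt is optional):

tok = Tokenizer()
lemmas = tok.tokenize_line('Коты спали на диване')
print(lemmas)            # lemmatized tokens, with stop words removed when the stop-word file is present
print(tok.join(lemmas))  # joined back into a single normalized string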
Example #10
def extract(input_filename, output_filename, number_of_documents, log_step,
            whole_size, index, verbose):
    m = Mystem()
    with open(output_filename, 'w', encoding='utf-8') as csvfile_out:
        with open(input_filename, "r", encoding="utf-8") as csvfile_in:
            datareader = csv.reader(csvfile_in)
            datawriter = csv.writer(csvfile_out)
            abs_step = int(whole_size * log_step / 100)
            count = 0
            for row in datareader:
                if count == 0:
                    datawriter.writerow(
                        insert_lemmatized_text_into_row(
                            row, 'text_lemmas', index))
                else:
                    datawriter.writerow(
                        insert_lemmatized_text_into_row(
                            row, ''.join(m.lemmatize(row[index])), index))
                if (number_of_documents > 0) and (count >=
                                                  number_of_documents):
                    return count
                elif (whole_size > 0) and (log_step > 0) and (
                        abs_step > 0) and (count % abs_step == 0):
                    log_percents(count / whole_size * 100, verbose)
                count += 1
            return count
Example #11
class HHParser:
    def __init__(self) -> None:
        self.mystem = Mystem()
        self.term_extractor = rutermextract.TermExtractor()
        self.russian_stopwords = stopwords.words("russian")
        with open(
                os.path.dirname(os.path.realpath(__file__)) + '/models.json',
                'rb') as file:
            self.models = dict(json.load(file))
        nltk.download("stopwords")

    def preprocess_text(self, text: str, word_limit: int):
        tokens = self.mystem.lemmatize(text.lower())
        tokens = [token.split(" ") for token in tokens]
        tokens = np.concatenate(tokens)
        tokens = [token.strip() for token in tokens if token not in self.russian_stopwords \
                  and token != " " \
                  and token.strip() not in punctuation]
        text = " ".join(tokens)
        terms = self.term_extractor(text, limit=word_limit, strings=True)

        return terms

    def answer_questions(self, uid: str, questions: List[str]):
        answers = {}
        for question in questions:
            question_terms = self.preprocess_text(question, 2)
            answer = parsehh(uid, question_terms=question_terms)
            if answer is not None and answer != {}:
                answers[question] = answer
        return answers
Example #12
def collection(folder, stop):
    m = Mystem()
    data = {}
    data_lemmas = []
    for root, dirs, files in os.walk(folder):
        for fname in files:
            f = open(root + '/' + fname, 'r', encoding='utf-8')
            article = f.read()
            f.close()
            article += '.'
            title = re.findall('@ti (.*?)\n', article)[0]
            url = re.findall('@url (.*?)\n', article)[0]
            text = re.findall('@url.*?\n(.*)\.', article, flags=re.DOTALL)[0]
            data[title] = [url, text]

    for key in data:
        wo_stop = []
        data[key][1] = re.sub('\n', ' ', data[key][1])
        data[key][1] = re.sub(' – ', ' ', data[key][1])
        data[key][1] = re.sub('[.,!?:;\'\"\(\)\[\]«»]', '', data[key][1])
        while '  ' in data[key][1]:
            data[key][1] = re.sub('  ', ' ', data[key][1])
        all_words = data[key][1].split(' ')
        dl = len(all_words)
        lemmas = m.lemmatize(data[key][1])
        for lem in lemmas:
            if lem not in stop:
                wo_stop.append(lem)
        data_lemmas.append([key, data[key][0], wo_stop, dl, all_words])

    return data_lemmas, data
def preprocess_text(str1):

    mystem = Mystem()
    tokens = mystem.lemmatize(str1.lower())
    str1 = " ".join(tokens)

    words = []
    for word in str1.split():
        if (word.isalpha()) and (not isEnglish(word)):
            words.append(word)

    res = set()
    for word in words:
        word_adv = word + '_ADJ'
        word_noun = word + '_NOUN'
        try:
            model.similarity(word_adv, 'слово_NOUN')
            res.add(word_adv)
        except BaseException:
            try:
                model.similarity(word_noun, 'слово_NOUN')
                res.add(word_noun)
            except BaseException:
                pass
    return res
def preprocess_text_lemmatize(text, setting="mystem"):
    text = text.lower()
    if text == '@@@':
        return text
    if setting == "mystem":
        mystem = Mystem()
        tokens = mystem.lemmatize(text)
    elif setting == "pymorphy":
        tokens = word_tokenize(text, language="russian")
        morph = pymorphy2.MorphAnalyzer()
        tokens = [morph.parse(token)[0].normal_form for token in tokens]
    else:
        raise Exception("parameter 'setting' must be 'mystem' or 'pymorphy'")
    # if len(tokens) == 1 and tokens[0] == "@@@":
    #   return " ".join(tokens)
    tokens = [token for token in tokens if token.strip() not in punctuation]
    tokens = " ".join(tokens)
    tokens = (tokens.replace('-', ' ').replace("``", '').replace("''", '')
              .replace(".", '').replace("«", '').replace("»", '')
              .replace("—", '').replace("№", ''))
    for symbol in punctuation:
        tokens = tokens.replace(symbol, '')
    for symbol in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']:
        tokens = tokens.replace(symbol, '')
    # print(tokens)
    return ' '.join(tokens.split())
Example #15
class Lemmatizer(BaseProcessor):
    def __init__(self):
        self.m = Mystem()

    def transform(self, tokens, *args):
        lemm_str = " ".join(tokens)
        return list(filter(lambda s: s.strip(), self.m.lemmatize(lemm_str)))
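A brief usage sketch for the Lemmatizer above (BaseProcessor and the token list are assumptions):

lemmatizer = Lemmatizer()
print(lemmatizer.transform(['коты', 'спали', 'на', 'диване']))
# expected roughly: ['кот', 'спать', 'на', 'диван']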
Example #16
def get_tags(text: str) -> List[str]:
    """Get text key words"""
    language = detect(text)
    original_text = text
    normalize_text = None
    keywords_dict = None
    rake_obj = None

    if language == 'ru':
        rake_obj = RAKE.Rake(STOP_WORDS_DIR_RU)
        m = Mystem()
        normalize_text = ''.join(m.lemmatize(original_text))

    if language == 'en':
        rake_obj = RAKE.Rake(STOP_WORDS_DIR_ENG)
        normalize_text = original_text  # no lemmatization needed for English

    if rake_obj is not None:
        keywords_dict = rake_obj.run(normalize_text,
                                     maxWords=2,
                                     minCharacters=2)
    keywords = []
    if keywords_dict:
        mean_rate = reduce(lambda item1, item2: item1 + item2[1],
                           keywords_dict, 0) / len(keywords_dict)

        keywords = [item[0] for item in keywords_dict if item[1] >= mean_rate]

    return keywords
Example #17
def TokenizeSentencesLemmatized(rawSentences, needStemming):
    print('total = ' + str(len(rawSentences)))
    sentences = []
    index = 0
    #st = nltk.stem.SnowballStemmer('russian')
    m = Mystem()
    for c in rawSentences:
        #start = time.time()
        tokenized_sents = m.lemmatize(c)
        cleaned_set = []
        for tokenized in tokenized_sents:
            if tokenized == "":
                break
            tokenized = tokenized.lower()
            if tokenized in stopwords.words('russian'):
                continue

            token = tokenized[0]
            if (token >= 'а' and token <= 'я') and needStemming:
                cleaned_set.append(tokenized)
            elif ((token >= 'а' and token <= 'я')
                  or (token >= 'a' and token <= 'z')):
                cleaned_set.append(tokenized)

        if len(cleaned_set) > 0:
            sentences.append(cleaned_set)

        #end = time.time()
        #print('Time: ' + str(end - start))
        print(index)
        index += 1
    return sentences
Example #18
    def __tokenize_sentences_lemmatized(self, rawSentences):
        sentences = []
        m = Mystem()
        index = 0
        for c in rawSentences:
            logging.warning(str(datetime.now()) + " tokenizing " + str(index))
            tokenized_sents = m.lemmatize(c)
            cleaned_set = []
            for tokenized in tokenized_sents:
                if tokenized == "":
                    break
                tokenized = tokenized.lower()
                if tokenized in stopwords.words('russian'):
                    continue

                # keep tokens whose first character is a Cyrillic or Latin lowercase letter
                token = tokenized[0]
                if ('а' <= token <= 'я') or ('a' <= token <= 'z'):
                    cleaned_set.append(tokenized)

            if len(cleaned_set) > 0:
                sentences.append(cleaned_set)
            index += 1

        return sentences
Example #19
def search():
    if request.args:
        search = request.args['search']
        with open('request.txt', 'w', encoding='utf-8') as fl:
            fl.write(search)
        with open('request.txt', 'r', encoding='utf-8') as f1:
            req = f1.read()
            m = Mystem()
            lemma_text = m.lemmatize(req)
            lemma_text = ' ' + str(''.join(lemma_text)) + ' '

        lemma_res = '%' + str(lemma_text).replace('\n', '') + '%'

        conn = sqlite3.connect('newspaper.db')
        c = conn.cursor()

        c.execute("SELECT title, url, plain FROM newspaper WHERE lemma LIKE ?",
                  (lemma_res,))

        rows = c.fetchall()

        res = []
        for row in rows:
            res.append(row)

        return render_template('search.html', search=search, res=res)

    return render_template('search.html')
Example #20
def search(request):
    appropriate = defaultdict(float)
    m = Mystem()
    article_data, avdl, inverted_index = main_func()
    N = len(article_data)
    text = re.sub('[&!?*&@#/.,:.,"––)(«»№]', '', request)
    words = [i.lower() for i in text.split()]

    lemmas = []
    for word in words:
        if word not in stopwords.words('russian'):
            lll = m.lemmatize(word)
            lemmas.append(lll[0])

    for lemma in lemmas:
        if lemma in inverted_index:
            lemma_count = inverted_index[lemma]
            n = len(lemma_count)
            for l in lemma_count:
                data = article_data[l[0]]
                qf = l[1]
                dl = data[2]
                appropriate[(data[0],
                             data[1])] += score_BM25(n, qf, N, dl, avdl)

    # rank documents by descending BM25 score
    result = sorted(appropriate.items(), key=lambda item: item[1], reverse=True)
    return result
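score_BM25 is not shown in this snippet; a common Okapi BM25 term score consistent with the arguments used above (an assumption, not necessarily the author's exact formula) is:

import math

def score_BM25(n, qf, N, dl, avdl, k1=1.5, b=0.75):
    # n: documents containing the term, qf: term frequency in the document,
    # N: total number of documents, dl: document length, avdl: average document length
    idf = math.log((N - n + 0.5) / (n + 0.5) + 1.0)
    return idf * qf * (k1 + 1) / (qf + k1 * (1 - b + b * dl / avdl))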
Example #21
    def detect_event(self, add_detected_word=False):
        """
        Метод, определяющий, есть ли в переданном сообщении информация про событие
        :param add_detected_word: bool, default=False
            Добавлять ли в начало сообщения слово, по которому было детектировано событие
            :return: bool
        """

        # Flag indicating whether this text contains information about an event
        is_event = False

        # The word that triggered the detector
        detected_word = ''

        # Initialize the lemmatizer
        lemmatizer = Mystem(grammar_info=False, entire_input=False)

        # Iterate over all words
        for norm_word in lemmatizer.lemmatize(
                self.convert_text(self.message.description)):

            # Check whether the word is in the dictionary of infosec-related words
            if norm_word in self.DETECT_EVENT_SET:
                is_event = True
                detected_word = norm_word
                break

        if add_detected_word:
            self.message.description = '#' + detected_word + '\n' + self.message.description

        return is_event
Example #22
class Lemmatisation(object):
    def __init__(self):
        self.ru_lem = Mystem()
        self.en_lem = nltk.stem.WordNetLemmatizer()

        self.ru_stop_words = set(
            nltk.corpus.stopwords.words('russian') +
            [chr(i) for i in range(ord('а'),
                                   ord('я') + 1)])
        self.en_stop_words = set(
            nltk.corpus.stopwords.words('english') +
            [chr(i) for i in range(ord('a'),
                                   ord('z') + 1)])

    def visible(self, term):
        if re.search(
                NOT_DIGIT_OR_LETTER, term
        ) or term in self.ru_stop_words or term in self.en_stop_words:
            return False
        return True

    def _lemmatize(self, doc):
        lemmas = self.ru_lem.lemmatize(doc)
        lemmas = [
            self.en_lem.lemmatize(lemma) for lemma in lemmas
            if self.visible(lemma)
        ]
        return ' '.join(lemmas)

    def lemmatize(self, doc_id, doc):
        try:
            return self._lemmatize(doc)
        except Exception as e:
            print(doc_id, e)
Example #23
def main_func():
    all_lemmas = {}
    article_data = {}
    avdl = 0

    for article in os.listdir('./articles'):
        textwithtags = open('./articles/' + article, 'r',
                            encoding='utf-8-sig').read()
        url = re.findall('@url (.*)', textwithtags)[0]
        name = re.findall('@ti (.*)', textwithtags)[0]
        text = re.findall('article=[0-9]+(.*)', textwithtags, flags=re.DOTALL)
        m = Mystem()
        if len(text) > 0:
            text = text[0]
            text = re.sub('[&!?*&@#/.,:.,"––)(«»№]', '', text)
            words = [i.lower() for i in text.split()]

            lemmas = []
            for word in words:
                if word not in stopwords.words('russian'):
                    lll = m.lemmatize(word)
                    lemmas.append(lll[0])

            all_lemmas[article] = lemmas
            article_data[article] = (url, name, len(lemmas))
            avdl += len(lemmas)

    avdl = avdl / len(all_lemmas)
    inverted_index = invert_index(all_lemmas)
    return article_data, avdl, inverted_index
Example #24
class Lemmatizer:
    def __init__(self, stop_words = None):
        self.stemmer = Mystem()
        self.cache = dict()  # MyCache(maxsize=1000000)
        stop_words = stop_words if stop_words is not None else []
        self.stop_words = set(stop_words + [' ', '\n', '\r\n', '\t'])

    def lemmatize_word(self, word):
        res = self.cache.get(word, None)
        if res is not None:
            return res

        lm = self.stemmer.lemmatize(word)
        lm = [w for w in lm if w not in self.stop_words]

        if len(lm) == 0:
            return None

        lemmatized_word = max(lm, key=lambda x: len(x))

        self.cache[word] = lemmatized_word

        return lemmatized_word

    def fit_transform(self, words):
        if len(words) == 0:
            return []

        res = [self.lemmatize_word(w) for w in words]
        res = [w for w in res if w is not None]
        return res
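A short usage sketch for the caching Lemmatizer above (the stop words and the word list are illustrative):

lemmatizer = Lemmatizer(stop_words=['и', 'в'])
print(lemmatizer.fit_transform(['коты', 'и', 'спали']))
# expected roughly: ['кот', 'спать'] -- 'и' is dropped as a stop word, and
# repeated words are served from the in-memory cache on subsequent calls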
Example #25
def process_text(text,
                 min_word_size=4,
                 min_sent_size=10,
                 extra_stop=None,
                 remove_short=True):
    html_cleaner = re.compile('<.*?>')
    cyrillic = re.compile(r'[^а-яА-Я ]')
    esc_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    extra_spaces = re.compile(r'\s{2,}')
    stop_list = stopwords.words('russian')
    if extra_stop:
        stop_list.extend(extra_stop)
    lmtzr = Mystem()

    n_sents = len(text)
    for i in tqdm(range(n_sents)):
        # html removal
        text[i] = html_cleaner.sub(' ', text[i])  # there's NO NoneType error
        # punctuation and numbers removal
        text[i] = esc_punctuation.sub(' ', text[i])
        # leaving only cyrillic words
        text[i] = cyrillic.sub(' ', text[i])
        # collapse extra spacing
        text[i] = extra_spaces.sub(' ', text[i])
        # lemmatization (see the pymystem3 docs)
        text[i] = ''.join(lmtzr.lemmatize(text[i])).strip()
        # stopwords removal + lowercasing
        text[i] = ' '.join([word.lower() for word in text[i].split() \
                            if len(word) >= min_word_size and word not in stop_list])

    text = [sent for sent in text if len(sent) >= min_sent_size]
    print('done!')
    return text
Example #26
class NewsTextDataset:
    def __init__(self):
        self.data = []
        self.unique_ids = []
        self.mystem = Mystem()
        self.russian_stopwords = stopwords.words("russian")

    def append(self, article: Article):
        if article.article_id not in self.unique_ids:
            self.unique_ids.append(article.article_id)
            self.data.append(article)
            return True
        else:
            return False

    def save(self, path):
        with open(path, "w") as fp:
            data = {
                "catalog": [ob.__dict__ for ob in self.data],
            }
            json.dump(
                data,
                fp,
                sort_keys=True,
                indent=4,
                ensure_ascii=False,
            )

    def load(self, path):
        with open(path) as json_file:
            data = json.load(json_file)
        self.data = [Article(dict_object=obj) for obj in data["catalog"]]

    def preprocess(self):
        for idx, article in tqdm(enumerate(self.data)):
            # r"[a-zA-Z]|\$|\d*|\(|\)|/@"
            pattern = r"[^а-яА-Я\s]"
            text = re.sub(pattern, "", article.text)
            tokens = self.mystem.lemmatize(text.lower())
            tokens = [
                token for token in tokens
                if token not in self.russian_stopwords and token != " "
                and token.strip() not in punctuation and ad.is_cyrillic(token)
            ]
            article.tokenized_text = tokens
            self.update(article, idx)

    def dump_to_pandas(self):
        return pd.DataFrame.from_records(
            [article.to_dict() for article in self.data], )

    def __len__(self):
        return len(self.data)

    def update(self, article, idx):
        self.data[idx].tokenized_text = article.tokenized_text

    # Useless for now
    def __getitem__(self, idx):
        return self.data[idx]
Example #27
class ActionDocs(Action):
    def __init__(self):
        self.m = Mystem()
        self.countries = json.load(open(file, "r"))

    def name(self) -> Text:
        return "action_get_docs"

    def run(self, dispatcher: CollectingDispatcher, tracker: Tracker,
            domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
        input_country = tracker.get_slot('country')
        lemmas = self.m.lemmatize(input_country)
        country = lemmas[0].capitalize()
        found = False
        for i in self.countries:
            if country == i["country"] or input_country == i["country"].lower(
            ):
                if input_country == i["country"]:
                    country = i["country"]
                found = True
                dispatcher.utter_message(text=f"{i['documents']}")
                break

        if not found:
            dispatcher.utter_message(
                text=f"Я не знаю такую страну '{country}'")

        return []
def func_lemma(text):
    m = Mystem()  # lemmatization model
    lemmas = []
    for i in range(len(text)):
        # join all words in i-topic after lemmatization
        lemmas.append(''.join(m.lemmatize(text[i])))
    return lemmas
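A usage sketch for func_lemma above (the topic texts are illustrative):

topics = ['Коты спали на диване', 'Собака лает во дворе']
print(func_lemma(topics))
# expected roughly: ['кот спать на диван\n', 'собака лаять во двор\n']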
Example #29
class ActionInZone(Action):
    def __init__(self):
        self.countries = json.load(open(file, "r"))
        self.m = Mystem()
        self.schengens = [
            "Австрия", "Бельгия", "Чешская Республика", "Дания", "Эстония",
            "Финляндия", "Франция", "Германия", "Греция", "Венгрия",
            "Исландия", "Италия", "Латвия", "Литва", "Люксембург", "Мальта",
            "Голландия", "Норвегия", "Польша", "Португалия", "Словакия",
            "Словения", "Испания", "Швеция", "Швейцария", "Лихтенштейн"
        ]

    def name(self) -> Text:
        return "is_schengen_zone"

    def run(self, dispatcher: CollectingDispatcher, tracker: Tracker,
            domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
        country = tracker.get_slot('country')
        lemmas = self.m.lemmatize(country)
        country = lemmas[0].capitalize()
        if country in self.schengens:
            dispatcher.utter_message(text=f"{country} член шенгенской зоны")
        else:
            dispatcher.utter_message(
                text=f"{country} не входит в шенгенскую зону")

        return []
Example #30
class Word2vecProcessor(object):
    """Объект для работы с моделью word2vec сходства слов"""
    def __init__(self, w2v_model_file):
        self.mystem = Mystem()
        self.word2vec = KeyedVectors.load_word2vec_format(w2v_model_file,
                                                          binary=True)
        self.lemma2word = {
            word.split('_')[0]: word
            for word in self.word2vec.index2word
        }

    def word_vector(self, word):
        lemma = self.mystem.lemmatize(word)[0]
        word = self.lemma2word.get(lemma)
        return self.word2vec[word] if word in self.word2vec else None

    def text_vector(self, text):
        """Вектор текста, получается путем усреднения векторов всех слов в тексте"""
        word_vectors = [
            self.word_vector(token) for token in word_tokenize(text.lower())
            if token.isalpha()
        ]
        word_vectors = [vec for vec in word_vectors if vec is not None]
        return np.mean(word_vectors, axis=0)

    def distance(self, vec1, vec2):
        if vec1 is None or vec2 is None:
            return 2
        return cosine(vec1, vec2)
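# A usage sketch for Word2vecProcessor above (the model path below is a
# hypothetical example; any binary word2vec file with POS-suffixed tokens,
# e.g. from RusVectores, would do, and NLTK's punkt tokenizer must be available):
#
#     w2v = Word2vecProcessor('ruwikiruscorpora_upos_skipgram_300_2_2018.bin')
#     v1 = w2v.text_vector('кот спит на диване')
#     v2 = w2v.text_vector('собака лежит на ковре')
#     print(w2v.distance(v1, v2))  # cosine distance; 2 is returned when a vector is missing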
class TextsLematizer():
    def __init__(self):
        self.m = Mystem()

    # text preprocessing function
    def text_hangling(self, text: str):
        try:
            txt = re.sub('[^a-zа-я\d]', ' ', text.lower())
            txt = re.sub('\s+', ' ', txt)
            # any additional processing, including alternative variants, can be plugged in here
            return txt
        except:
            return ""

    # lemmatize a single text
    def text_lemmatize(self, text: str):
        try:
            lemm_txt = self.m.lemmatize(text)
            lemm_txt = [w for w in lemm_txt if w not in [' ', '\n']]
            return lemm_txt
        except:
            return ['']

    # lemmatize a list of texts
    def texts_lemmatize(self, texts_list):
        return [
            self.text_lemmatize(self.text_hangling(tx)) for tx in texts_list
        ]
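    # A short usage sketch for TextsLematizer above (illustrative input):
    #     lem = TextsLematizer()
    #     lem.texts_lemmatize(['Коты спали на диване!', 'Собака лает.'])
    #     # expected roughly: [['кот', 'спать', 'на', 'диван'], ['собака', 'лаять']]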
    def extract(self):
        try:
            # count how many files are in the input directory
            input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x != " ", m.lemmatize(text))
                count_of_rows = 0
                for i in range(0, len(list_of_terms)):
                    if list_of_terms[i] == '\n' or list_of_terms[i] == ' \n':
                        count_of_rows += 1
                    if list_of_terms[i] == ' \n':
                        list_of_terms[i] = '\n'
                if count_of_rows < self.threshold_of_rows_count:
                    first_list_of_terms = list_of_terms
                    list_of_terms = []
                    for i in range(0, len(first_list_of_terms)):
                        if first_list_of_terms[i] != '\n':
                            list_of_terms.append(first_list_of_terms[i])
                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # collapse repeated words into per-document counts
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # compute tf
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # compute idf
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # write the result
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
    def extract(self):
        try:
            # count how many files are in the input directory
            input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
                my_list = list_of_terms
                list_of_terms = []
                for term in my_list:
                    if m.analyze(term)[0].get(u'analysis'):
                        if not m.analyze(term)[0][u'analysis'][0][u'gr'].startswith(self.service_parts_of_speech) and len(term) > 1:
                            list_of_terms.append(term)
                        if term == u'не':
                            list_of_terms.append(term)
                    else:
                        list_of_terms.append(term)
                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # collapse repeated words into per-document counts
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # compute tf
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # compute idf
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # write the result
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
Example #34
class Runner(object):
    def __init__(self, input_text):
        self.lemmatize = None
        while True:
            response = raw_input("Do you want to lemmatize text first? (yes/no)\n").lower()
            if response == "yes":
                print "You should wait for a while"
                self.lemmatize = True
                self.stemmer = Mystem()
                break
            elif response == "no":
                self.lemmatize = False
                break

        self.word_lists = list()
        with open(input_text, "r") as f:
            for line in f:
                line += "."
                if self.lemmatize:
                    lexemes = self.stemmer.lemmatize(line)
                    word_list = list()  # list of words not separated by punctuation marks
                    for lexeme in lexemes:
                        lexeme = lexeme.strip()
                        if lexeme:
                            if lexeme.translate(None, '.,?!:;()"\' -\t\n'):  # check that the lexeme is not a punctuation mark
                                lexeme = lexeme.decode("utf-8")
                                if is_cyrillic(lexeme):
                                    word_list.append(lexeme)
                            else:  # otherwise, store the collected word list and start a new empty one
                                self.word_lists.append(word_list)
                                word_list = list()
                else:
                    line = line.replace(".", " . ").replace(",", " , ").replace(":", " : ").replace(";", " ; ")\
                        .replace("?", " ? ").replace("!", " ! ").replace("(", " ( ").replace(")", " ) ")\
                        .replace("--", " -- ").replace(".", " . ")
                    word_list = list()
                    for lexeme in line.split():
                        # check that the lexeme is not a punctuation mark
                        lexeme = lexeme.translate(None, '.,?!:;()"\'').replace("--", "").decode("utf-8").strip().lower()
                        if lexeme:
                            if is_cyrillic(lexeme):
                                word_list.append(lexeme)
                        else:
                            if word_list:
                                self.word_lists.append(word_list)
                            word_list = list()

        train, test = self.split()
        self.lid = Lid(train, test)
        self.lid.run()

    def split(self):
        n = len(self.word_lists)
        train = self.word_lists[:n*9/10]
        test = self.word_lists[n*9/10:]
        return train, test
Example #35
class Index(object):

    def __init__(self, input_file):
        self.stemmer = Mystem()
        self.documents = dict()
        self.tokens = list()
        self.terms = dict()
        self.index = list()

        # reading documents, making tokenization
        with open(input_file, "r") as f:
            for i, line in enumerate(f, start=1):
                self.documents[i] = line.decode("utf-8")
                for word in self.stemmer.lemmatize(line):
                    token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip()
                    if token:
                        self.tokens.append((token, i))

        # sorting by tokens first, then by frequency
        self.tokens.sort(key=lambda tup: (tup[0], tup[1]))

        # terminization and building index
        current_term = self.tokens[0][0]
        current_doc_id = self.tokens[0][1]
        doc_ids = [current_doc_id]
        for token, doc_id in self.tokens:
            term = token.lower()
            if term == current_term:
                if doc_id != current_doc_id:
                    doc_ids.append(doc_id)
                    current_doc_id = doc_id
            else:
                self.terms[current_term] = (len(doc_ids), doc_ids)
                self.index.append((current_term, len(doc_ids), doc_ids))
                current_term = term
                current_doc_id = doc_id
                doc_ids = [doc_id]
        self.terms[current_term] = (len(doc_ids), doc_ids)
        self.index.append((current_term, len(doc_ids), doc_ids))

    def print_to_file(self):
        with open("result.txt", "w") as f:
            for term, count, doc_ids in self.index:
                f.write("{},\t{},\t{}\n".format(term.encode("utf-8"), count, doc_ids))

    def print_statistics(self):
        terms_num = len(self.terms)
        terms_len = 0.
        for term in self.terms:
            terms_len += len(term)

        print "***********************"
        print "Number of terms = {}".format(terms_num)
        print "Average term length = {}".format(terms_len / terms_num)
        print "***********************"
 def extract(self):
     try:
          # count how many files are in the input directory
         input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
         output_data = {}
         list_of_all_n_grams = {}
         m = Mystem()
          # iterate over the documents
         for file in input_files:
             with open(self.input_directory + '/' + file) as data_file:
                 data = json.load(data_file)
             list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
             text = " ".join(["%s" % term for term in list_of_terms])
             list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
             list_of_n_grams_tuples = {}
             for j in range(0, self.n):
                 list_of_n_grams_tuples[j] = zip(*[list_of_terms[i:] for i in range(j + 1)])
             list_of_n_grams_strings = []
             for j in range(0, self.n):
                 for gram_tuple in list_of_n_grams_tuples[j]:
                     string_of_n_gram = " ".join(["%s" % term for term in gram_tuple])
                     list_of_n_grams_strings.append(string_of_n_gram)
             output_data[file] = {}
             output_data[file]['id'] = data['id']
             output_data[file]['positive'] = data['positive']
             output_data[file]['sarcasm'] = data['sarcasm']
             output_data[file]['terms'] = {}
              # collapse repeated n-grams into per-document counts
             for gram in list_of_n_grams_strings:
                 if gram not in output_data[file]['terms']:
                     output_data[file]['terms'][gram] = 1
                 else:
                     output_data[file]['terms'][gram] += 1
             for gram in output_data[file]['terms'].keys():
                 if gram not in list_of_all_n_grams:
                     list_of_all_n_grams[gram] = 1
                 else:
                     list_of_all_n_grams[gram] += 1
                  # compute tf
                 count_of_n_grams = output_data[file]['terms'][gram]
                 output_data[file]['terms'][gram] = {'tf': float(count_of_n_grams)/len(list_of_n_grams_strings), 'idf': 0,
                                                     'count': float(count_of_n_grams)}
         for file in input_files:
              # compute idf
             for gram in output_data[file]['terms'].keys():
                 output_data[file]['terms'][gram]['idf'] = math.log(float(len(input_files))/list_of_all_n_grams[gram])
              # write the result
             with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                 json.dump(output_data[file], output_file)
     except Exception:
         return False
     else:
         return True
Example #37
class Index(object):
    def __init__(self, input_file):
        self.stemmer = Mystem()
        self.tokens = list()
        self.index = dict()
        self.number_of_documents = 0

        try:
            self.read_from_file_compressed("index_compressed.txt")
        except:
            # reading documents, making tokenization
            with open(input_file, "r") as f:
                for line in f:
                    self.number_of_documents += 1
                    # self.documents[i] = line.decode("utf-8")
                    for word in self.stemmer.lemmatize(line):
                        token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip()
                        if token:
                            self.tokens.append((token, self.number_of_documents))

            # sorting by tokens first, then by frequency
            self.tokens.sort(key=lambda tup: (tup[0], tup[1]))

            # terminization and building index
            current_term = self.tokens[0][0]
            current_doc_id = self.tokens[0][1]
            doc_ids = [current_doc_id]
            for token, doc_id in self.tokens:
                term = token.lower()
                if term == current_term:
                    if doc_id != current_doc_id:
                        doc_ids.append(doc_id)
                        current_doc_id = doc_id
                else:
                    self.index[current_term] = (len(doc_ids), pack_doc_ids(doc_ids))
                    current_term = term
                    current_doc_id = doc_id
                    doc_ids = [doc_id]
            self.index[current_term] = (len(doc_ids), pack_doc_ids(doc_ids))
            del self.tokens
            self.write_index_in_file()

    def write_index_in_file(self):
        with open("index_compressed.txt", "w") as f:
            pickle.dump(self.index, f)

    def read_from_file_compressed(self, index_file):
        with open(index_file, "r") as f:
            self.index = pickle.load(f)
Example #38
def mystem_using(input_directory, output_directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(input_directory))
    output_data = {}
    m = Mystem()
    for input_file in input_files:
        with open(input_directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x != '', re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', data['text']))
        text = " ".join(["%s" % term for term in list_of_terms])
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), m.lemmatize(text))
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])
        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output

        with open(output_directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
def search():
    cn = None
    file = codecs.open('static/articles.xml', 'r', 'utf-8')
    rfile = file.read()
    tree = lxml.etree.fromstring(rfile)
    res = tree.xpath('entry')
    categ = {
        'cat': 'Категория', 'wgroup': 'Группа слов с близким значением', 'comm': 'Комментарии',
        'stdiff': 'Стилистические различия', 'overlap': 'Совпадающая часть значения',
        'dom': 'Доминанта группы', 'diffmark': 'Различительные признаки, релевантные для данной группы',
        'diff': 'Смысловые различия', 'rare': 'Редкие слова, примыкающие к группе',
        'anmean': 'Другие значения слов, входящих в группу', 'comb': 'Сочетаемость', 'reg': 'Региональные варианты',
        'adict': 'Данные академических словарей', 'doc': 'Нормативные документы',
        'etim': 'Этимология', 'ill': 'Иллюстрации'
    }
    file.close()
    ms = Mystem()
    wordsearch = ms.lemmatize(request.form['search'].lower())[0]

    for i in res:
        if wordsearch == '':
            cn = 'Пустой запрос'
        elif i.text.lower().startswith(wordsearch):
            arr = []
            for j in i.iter():
                for k in categ:
                    if j.tag == k:
                        if j.text != 'null':
                            arr.append('<font size="4"><b>' + str(categ[j.tag]) + '</b></font><br>' + str(j.text))
                text = '<br><br>'.join([j for j in arr[1:]])
                text = re.sub('\*', '<b>', text)
                text = re.sub('\#', '</b>', text)
                text = re.sub('\$', '<i>', text)
                text = re.sub('\%', '</i>', text)
                text = re.sub('\@', '<font color="#696969">', text)
                text = re.sub('\+', '</font>', text)
                cn = '<strong><big>' + i.text + '</big></strong><br><br>' + re.sub('\n', '<br>', text)
            break
        else:
            cn = 'По Вашему запросу ничего не найдено. <br>' \
                 'Попробуйте использовать "Поиск по тегу" или измените запрос.'
    return render_template('search.html', cn=Markup(cn))
Example #40
    def __init__(self, input_text):
        self.number_of_words = 0
        self.number_of_bigrams = 0
        self.words_frequency = dict()
        self.bigrams_frequency = dict()
        self.words_position = dict()  # how often word W occurs in the first and in the second position of a bigram

        while True:
            response = raw_input("Do you want to lemmatize text first? (yes/no)\n").lower()
            if response == "yes":
                print "You should wait for a while"
                LEMMATIZE = True
                stemmer = Mystem()
                break
            elif response == "no":
                LEMMATIZE = False
                break

        with open(input_text, "r") as f:
            for i, line in enumerate(f, start=1):
                line = line + "."
                if LEMMATIZE:
                    lexemes = stemmer.lemmatize(line)
                    words_list = list()  # list of words not separated by punctuation marks
                    for lexeme in lexemes:
                        lexeme = lexeme.strip()
                        if lexeme:
                            if lexeme.translate(None, '.,?!:;()"\' -\t\n'):  # check that the lexeme is not a punctuation mark
                                lexeme = lexeme.decode("utf-8")
                                if is_cyrillic(lexeme):
                                    words_list.append(lexeme)
                            else:  # otherwise, add the bigrams from the list and start a new empty list
                                n = len(words_list)
                                if n > 1:
                                    w1 = words_list[0]
                                    self.__add_word(w1)
                                    for w2 in words_list[1:]:
                                        self.__add_word(w2)
                                        self.__add_bigram(w1, w2)
                                        w1 = w2
                                words_list = list()
                else:
                    line = line.replace(".", " . ").replace(",", " , ").replace(":", " : ").replace(";", " ; ")\
                        .replace("?", " ? ").replace("!", " ! ").replace("(", " ( ").replace(")", " ) ")\
                        .replace("--", " -- ").replace(".", " . ")
                    words_list = list()
                    for lexeme in line.split():
                        # check that the lexeme is not a punctuation mark
                        lexeme = lexeme.translate(None, '.,?!:;()"\'').replace("--", "").decode("utf-8").strip().lower()
                        if lexeme:
                            if is_cyrillic(lexeme):
                                words_list.append(lexeme)
                        else:
                            n = len(words_list)
                            if n > 1:
                                w1 = words_list[0]
                                self.__add_word(w1)
                                for w2 in words_list[1:]:
                                    self.__add_word(w2)
                                    self.__add_bigram(w1, w2)
                                    w1 = w2
                            words_list = list()

                if i % 1000 == 0:
                    print "Computing line {}".format(i)
            print "total words = {}".format(self.number_of_words)
            print "unique words = {}".format(len(self.words_frequency))
            print "total bigrams = {}".format(self.number_of_bigrams)
            print "unique bigrams = {}".format(len(self.bigrams_frequency))

        with open("bigrams.txt", "w") as f:
            bigrams = list(self.bigrams_frequency.items())
            bigrams.sort(key=lambda tup: (-tup[1], tup[0]))
            for bigram in bigrams:
                f.write("{}\n".format(bigram[0].encode("utf-8")))
Beispiel #41
0
from pymystem3 import Mystem
m = Mystem()

t = 'Чайника, сегодня не было'
lemma = m.lemmatize(t)


def lemmas(text):
    punc = list('.?!-;:",')
    text = [i for i in text if i not in punc]
    text = ''.join(text)
    text = m.lemmatize(text)
    textn = ''
    for w in text:
        if w not in (' ', '\n'):
            textn += w + ' '
    return textn.strip()


from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import os

s_w = stopwords.words('russian')
sw = [i for i in s_w]

v = TfidfVectorizer(stop_words=sw)  # remove stop words
#v = TfidfVectorizer()  # keep stop words

totalCorpus = []
suspenseCorpus = ''
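A minimal sketch of how these pieces might be wired together; the two toy documents below are invented for illustration, and only the shape of the resulting matrix is checked:

# hypothetical usage of lemmas() and the vectorizer defined above
docs = ['Мама мыла раму.', 'Сегодня не было чайника.']  # invented toy corpus
totalCorpus = [lemmas(d) for d in docs]                  # space-separated lemmas per document
tfidf_matrix = v.fit_transform(totalCorpus)              # rows = documents, columns = vocabulary terms
print(tfidf_matrix.shape)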
Beispiel #42
0
    with open(file_in) as parsed_in, \
         open("..\\data\\stemmed\\" + name + "_mystem.tsv", "wb") as mystem_out:
         # open("..\\data\\stemmed\\" + name + "_porter.tsv", "wb") as porter_out, \

        parsed_in = csv.reader(parsed_in, delimiter='\t')
        mystem_out = csv.writer(mystem_out, delimiter='\t') #, quoting=csv.QUOTE_NONE

        mystem = Mystem()
        prep_counter = 0

        for row in parsed_in:
            exclude = ['\'', '\"', '.', ',', '!', '?', u'«', u'»']
            s = ''.join(ch for ch in row[1].decode("utf-8") if ch not in exclude)

            stemmed_tokens = mystem.lemmatize(s)
            stemmed_tokens = [token if emoticon_re.search(token) else token.lower() for token in stemmed_tokens]

            # punctuation = list(string.punctuation.decode("utf-8"))
            # stop = punctuation
            # stop = ['!', '"', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
            #         ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'] #'@',
            stop = ['rt', 'via', '...', "…".decode("utf-8")]
            stemmed_tokens = [token if token not in stop else '' for token in stemmed_tokens]

            stemmed_str = "".join([token for token in stemmed_tokens])
            mystem_out.writerow([row[0], stemmed_str.encode("utf-8").replace('\n', ' ')])

            prep_counter += 1
            # Print a status message every 100th row
            if prep_counter % 100 == 0:
                print "Lemmatized %d strings" % (prep_counter)
Beispiel #43
0
with open("../data/" + PREFIX + "norm_sentences.txt", "w") as writer:
    count = 0
    raw = []
    normalized = []

    for line in open("../data/" + PREFIX + "parsed.txt"):

        if count % 1000 == 0:
            print count

        line = re.sub("[\W\d]+", " ", line.strip().decode("utf-8").strip(), 0, re.UNICODE)
        line = re.sub("\s+", " ", line.strip(), 0, re.UNICODE).lower()
        raw.extend(line.split(" "))
        writer.write("* " + line.encode("utf-8") + " **;")
        # print line, '->',
        line = " ".join(normalizer.lemmatize(line))
        line = re.sub("\s+", " ", line, 0, re.UNICODE)
        lemmatized = filter(lambda x: len(x.strip()) > 0, normalizer.lemmatize(line))
        normalized.extend(lemmatized)
        # print line
        writer.write("* " + " ".join(lemmatized).encode("utf-8") + " **\n")

        count += 1

# print 'saving raw'
#
# with open("../data/raw_terms.txt", "w") as f:
#     for term in set(raw):
#         f.write(term.encode("utf-8") + "\n")
#
# print 'saving norm'
Beispiel #44
0
# Using pymystem3 lemmatize texts
import sys
from pymystem3 import Mystem

text = sys.argv[1]
m = Mystem()

lemmas = m.lemmatize(text)

print(''.join(lemmas))
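Since the script reads its input from sys.argv[1], it is meant to be run from the command line. Assuming it is saved as lemmatize.py (the filename is an assumption), a call looks like:

    python lemmatize.py "Мама мыла раму"

which prints the joined lemmas, here "мама мыть рама" (see the tests below for the exact token sequence Mystem returns).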
Beispiel #45
0
 def test_mystem_abc(self):
     m = Mystem()
     tokens = m.lemmatize("ABC")
     assert ["ABC", "\n"] == tokens
Beispiel #46
0
 def test_mystem_not_entireinput(self):
     m = Mystem(entire_input=False)
     tokens = m.lemmatize("Мама мыла раму")
     assert ["мама", "мыть", "рама"] == tokens
Beispiel #47
0
 def test_mystem(self):
     m = Mystem()
     tokens = m.lemmatize("Мама мыла раму")
     assert ["мама", " ", "мыть", " ", "рама", "\n"] == tokens
Beispiel #48
0
def lemma(text):
    m = Mystem()
    lemmas = m.lemmatize(text)
    titleStemmed = ''.join(lemmas)
    return titleStemmed
		temporal.append(lines[i])
	if temporal:
		if lines[i].find('>0<') != -1:
			temporal.append('0')
		elif lines[i].find('>-1<') != -1:
			temporal.append('-1')
		elif lines[i].find('>1<') != -1:
			temporal.append('1')
		if len(temporal) == 2:
			strings.append(temporal[0])
			labels.append(temporal[1])
			temporal = []

#lemmatization
mystem = Mystem()
lemmas_norm = list(set(mystem.lemmatize(words_str)))

#cleaning up the sentences
rus_symbols = re.compile('[а-я]|\s')
text = ''

for string in strings:
	for symbol in string:
		if rus_symbols.search(symbol):
			text += symbol
	text += '. '

sentences = text.split('. ')

for i in range(len(sentences)):
    sentences[i] = sentences[i].strip()
    def extract(self):
        try:
            # count how many files live in the input directory
            input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))


                # handle negation: glue 'не' to the following word, skipping an intensifier if one sits in between
                nums_of_bigrams = []
                helping_words = [u'совсем', u'очень', u'слишком', u'самый']
                for i in range(0, len(list_of_terms) - 1):
                    if list_of_terms[i] == u'не' and list_of_terms[i+1] not in helping_words:
                        nums_of_bigrams.append((i, i+1))
                    elif list_of_terms[i] == u'не' and list_of_terms[i+1] in helping_words and i + 2 < len(list_of_terms):
                        nums_of_bigrams.append((i, i+2))
                for i in range(0, len(nums_of_bigrams)):
                    if nums_of_bigrams[i][0] + 1 == nums_of_bigrams[i][1]:
                        list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + ' ' + list_of_terms[nums_of_bigrams[i][1]]
                        list_of_terms[nums_of_bigrams[i][1]] = ''
                    elif nums_of_bigrams[i][0] + 2 == nums_of_bigrams[i][1]:
                        list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + ' ' + list_of_terms[nums_of_bigrams[i][1]]
                        list_of_terms[nums_of_bigrams[i][1] - 1] = ''
                        list_of_terms[nums_of_bigrams[i][1]] = ''
                list_of_terms = filter(lambda x: x != '', list_of_terms)

                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # count each term, collapsing repeats
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # compute tf
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # compute idf
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # write the result
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
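To make the tf/idf bookkeeping in extract() concrete, here is a small worked sketch of the same formulas on an invented two-document corpus (names and numbers are for illustration only):

import math

# hypothetical corpus: two documents, already lemmatized and negation-merged
doc_terms = {
    'doc1': [u'не нравиться', u'фильм', u'фильм'],
    'doc2': [u'нравиться', u'фильм'],
}

doc_count = {}   # in how many documents each term occurs
tf = {}
for name, terms in doc_terms.items():
    counts = {}
    for t in terms:
        counts[t] = counts.get(t, 0) + 1
    tf[name] = {t: float(c) / len(terms) for t, c in counts.items()}
    for t in counts:
        doc_count[t] = doc_count.get(t, 0) + 1

idf = {t: math.log(float(len(doc_terms)) / n) for t, n in doc_count.items()}
# tf['doc1'][u'фильм'] == 2/3, idf[u'фильм'] == log(2/2) == 0.0
# idf[u'не нравиться'] == log(2/1) ≈ 0.693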
API_KEY = "api_key"

if __name__ == "__main__":
    not_translated = []
    dictionary = {}
    print(len(dictionary.keys()))
    m = Mystem()
    df = pandas.read_csv("/media/alexander/b32bf4b4-8724-4107-9d19-abf6615c2f60/alexander/HELP_FILE/query.yaHotelId.showInTop.sure.final.tsv", sep="\t")
    df_size = len(df["query"])
    k = 1
    for line in df["query"]:
        print(k, "query from", df_size)
        k += 1
        for word in line.strip().split():
            lema_word = m.lemmatize(word)[0]
            if dictionary.get(lema_word) is None:
                params = {"key": API_KEY, "text": lema_word, "lang": "ru-en"}
                try:
                    r = requests.get("https://translate.yandex.net/api/v1.5/tr.json/translate", params=params)
                    r_json = r.json()
                    if r_json["code"] != 200:
                        print("ERROR", r_json["code"])
                        not_translated.append(lema_word)
                        continue
                    trans_word = r_json["text"][0]
                except Exception as exc:
                    print("ERROR")
                    not_translated.append(lema_word)
                    continue
                if (len(trans_word.split()) > 1):
Beispiel #52
0
def poehali(csv_input):
	'''
	Main function
	csv_input -- CSV file with the table of links
	Output layout:
	|-xmlFile/
	|---------year/
	|--------------month/
	=========
	|-plain/
	|-------year/
	|------------month/
	=========
	|-html/
	|------year/
	|-----------month/
	|csv_file.csv

	'''
	data = []
	i = 0
	m = Mystem()
	gusina()
	col = ["path", "author", "sex", "birthday", "header", "created", "sphere", "genre_fi", "type", "topic", "chronotop", "style", "audience_age", "audience_level", "audience_size", "source", "publication", "publisher", "publ_year", "medium", "country", "region", "language"]
	time.sleep(3)

	path = os.getcwd()
	path = path + "/"
	csv_file = open(path + "csv_file.csv", "w")
	writer = csv.writer(csv_file,delimiter = ",")
	writer.writerow(col)


	dosugvbryanske = re.compile("^(http://www.briansk.ru/)(.+)")

	with open(csv_input) as csvfile:
		reader = csv.DictReader(csvfile)
		for row in reader:
			if re.search(dosugvbryanske, row['url']):
				print('passing on ' + str(i))

				test = urllib.request.urlopen(row['url']).read().decode('cp1251')
				file_html = path+"/"+str(i)+".html"
				file_html1 = path+"/"+str(i-1)+".html"
				dest_html = str(i)+".html"
				plain = str(i)+".txt"
				plain_new = str(i)+"_plained.txt"
				plain_stem = str(i)+"_mystem.txt"
				output_plain_stem = str(i)+"_out_mystem.txt"
				xmlFile = str(i) + ".xml"
				#dir_for_stem = "XML_STEM"

				page1_html = open(file_html, 'w')
				page1_html.write(str(test))
				page1_html.close()
				print("FILE EX: "+ str(os.path.exists(file_html)))
				pageMoving = open(file_html, 'r')
				#print(file_html + " PATH " + dest_html+"\n")
				if os.path.exists(file_html1):
					os.remove(file_html1)
					print("FILE "+str(i-1)+" HB REMOVED")
				else:
					print("FILE "+str(i-1)+" HB ALREADY MOVED")
				for line in pageMoving:
					data = re.search(r"\">[0-9]{1,2}\s{1}((янв|февр|март|апре|май|июнь|июль|авг|сентя|октяб|нояб|декаб)[а-я]{1,}\s[0-9]{4})|\">[0-9]{1,2}\s{1}(ма(а|я)\s[0-9]{4})", line)
					if data:

						'''
						Determine the article's publication date
						'''
						dates = data.group()
						dates2 = dates.split()
						year = dates2[2]
						month = dates2[1]

						create_folder(path, year, transpose_month(month), "html")
						shutil.move(file_html, path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html)
						print("FILE "+str(i)+" HB MOVED")


						'''
						Create the directory for the XML
						'''
						create_folder(path, year, transpose_month(month), "xmlFile")

						forxml = path+"xmlFile/"+year+"/"+transpose_month(month)+"/"+dest_html
						forxml_dir = path+"xmlFile/"+year+"/"+transpose_month(month)+"/"
						xml_stem = forxml_dir + str(i) + "_mystem.xml"
						rofxml = path+"xmlFile/"+year+"/"+transpose_month(month)+"/"+xmlFile

						'''
						Copy html -> xml dir for further processing
						'''

						shutil.copy(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html, forxml)
						print("FILE "+str(i)+" HB COPIED TO XML")
						openindosug_xml = open(forxml, "r")
						read_and_clean_xml = openindosug_xml.read()
						xml_data = amixml(read_and_clean_xml)
						#print(xml_data[2])
						openindosug_xml.close()
						'''
						Create the directory for the plain text
						'''
						
						create_folder(path, year, transpose_month(month), "plain")
						forplain = path+"plain/"+year+"/"+transpose_month(month)+"/"+dest_html
						forplain_dir = path+"plain/"+year+"/"+transpose_month(month)+"/"
						shutil.copy(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html, forplain)
						print("FILE "+str(i)+" HB COPIED TO PLAIN")
						openindosug = open(forplain, "r")

						dates = re.sub("\">", "", dates)


						'''
						wri -- list used to generate the INFO record about the article
						'''

						wri = ["briansk.ru", str(xml_data[1]), toddmmyyy(dates), "", row['url']]


						page2_txt = open(str(forplain_dir)+str(plain), 'w')
						for datline in openindosug:
							page2_txt.write(str(make_it_clean(datline)))
						
						page2_txt.close()
						print("PLAIN FOR "+str(i)+" HB CREATED")

						'''
						Final cleanup of the plain file; keep only the article text, or the text plus the INFO
						'''
						provide_clean_file(forplain_dir+str(plain),forplain_dir+str(plain_new), wri, "extra")
						provide_clean_file(forplain_dir+str(plain),forplain_dir+str(plain_stem), wri, "mystem")
						os.remove(forplain_dir+str(plain))
						os.remove(forplain)
						openindosug.close()

						'''
						xml_data[0] -- content
						xml_data[1] -- headerTag
						xml_data[2] -- content date
						'''

						'''
						Generate the XML
						'''
						pageEtree = etree.Element('html')
						doc = etree.ElementTree(pageEtree)
						infoTag = etree.SubElement(pageEtree, "body")
						dateTag = etree.SubElement(infoTag, "h1")
						dateTag.text = str(xml_data[2])
						headerTag = etree.SubElement(infoTag, "h2")
						headerTag.text = str(xml_data[1])
						mainTag = etree.SubElement(infoTag, "h3")
						contentTag = etree.SubElement(infoTag, "h4")
						contentTag.text = str(xml_data[0])
						outFile = open(str(forxml_dir)+str(i)+".xml", 'wb')
						doc.write(outFile, xml_declaration=True, encoding='utf-16') 
						outFile.close()
						print("FILE "+str(i)+" HB CODED TO XML")

						writer.writerow([str(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html) , "briansk.ru" , "" , "" , str(xml_data[1]) , toddmmyyy(dates), 'публицистика' , "" , "" , "категория" , "" , "нейтральный" , "н-возраст" , "н-уровень" , "городская" , str(row['url']) , "брянск.ru" , "" , str(year) , "газета" , "Россия" , "БРЯНСК" , "ru"])
						os.remove(forxml)


						input_plain = forplain_dir + plain_stem
						output_plain = forplain_dir + output_plain_stem


						'''
						Lemmatize with pymystem3, then run the mystem CLI on the result
						'''
						
						with open(input_plain) as file:
						    text = file.read()
						

						lemmas = m.lemmatize(text)
						with open(input_plain, 'w') as file:
							file.write(''.join(lemmas))

						os.system(r'/home/haniani/Загрузки/mystem -icd '+ input_plain + ' ' + output_plain)
						os.system(r'/home/haniani/Загрузки/mystem -icd --format xml '+ input_plain +' '+ xml_stem)
						

						print("MYSTEM'ed "+str(i))
						break

				i += 1
				print("PASSED ; NEXT: "+str(i)+"\n")
	csv_file.close()
	        
	for file in glob.glob(path+"*.html"):
		os.remove(file)
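The two os.system calls inside poehali() build the mystem command line by string concatenation; a sketch of the same invocations via subprocess, which avoids shell-quoting problems (same binary path and flags as above, argument list assumed to match):

import subprocess

mystem_bin = '/home/haniani/Загрузки/mystem'
# equivalents of the os.system calls above
subprocess.call([mystem_bin, '-icd', input_plain, output_plain])
subprocess.call([mystem_bin, '-icd', '--format', 'xml', input_plain, xml_stem])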
    def extract(self):
        try:
            # count how many files live in the input directory
            input_files = filter(lambda x: not x.endswith("~"), os.listdir(self.input_directory))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + "/" + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data["text"]))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
                my_list_of_terms = []
                for term in list_of_terms:
                    my_term = term
                    term = u""
                    prev_letter = my_term[0]
                    term += my_term[0]
                    for i in range(1, len(my_term)):
                        if my_term[i] != prev_letter:
                            term += my_term[i]
                        prev_letter = my_term[i]
                    my_list_of_terms.append(term)
                list_of_terms = my_list_of_terms
                output_data[file] = {}
                output_data[file]["id"] = data["id"]
                output_data[file]["positive"] = data["positive"]
                output_data[file]["sarcasm"] = data["sarcasm"]
                output_data[file]["terms"] = {}
                # count each term, collapsing repeats
                for term in list_of_terms:
                    if term not in output_data[file]["terms"]:
                        output_data[file]["terms"][term] = 1
                    else:
                        output_data[file]["terms"][term] += 1
                for term in output_data[file]["terms"].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # compute tf
                    count_of_terms = output_data[file]["terms"][term]
                    output_data[file]["terms"][term] = {
                        "tf": float(count_of_terms) / len(list_of_terms),
                        "idf": 0,
                        "count": count_of_terms,
                    }

            for file in input_files:
                # compute idf
                for term in output_data[file]["terms"].keys():
                    output_data[file]["terms"][term]["idf"] = math.log(
                        float(len(input_files)) / list_of_all_terms[term]
                    )
                # write the result
                with open(self.output_directory + "/" + file + "_tf-idf", "w") as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
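The inner loop in this second extract() squeezes runs of repeated letters (so an elongated 'оооочень' becomes 'очень'). The same transformation can be written more compactly with itertools.groupby; a sketch of an equivalent helper (the name squeeze_repeats is an assumption):

from itertools import groupby

def squeeze_repeats(term):
    # keep a single character per run of identical adjacent characters, as the loop above does
    return u''.join(ch for ch, _ in groupby(term))

assert squeeze_repeats(u'оооочень') == u'очень'
assert squeeze_repeats(u'хорошо') == u'хорошо'   # no adjacent repeats, unchanged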