Code example #1
File: features.py Project: pombredanne/senty
def without_pronouns(directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' '))
        my_list = list_of_terms
        list_of_terms = []
        for term in my_list:
            if m.analyze(term)[0].get(u'analysis'):
                if not m.analyze(term)[0][u'analysis'][0][u'gr'].startswith((u'SPRO', u'APRO')):
                    list_of_terms.append(term)
            else:
                list_of_terms.append(term)
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])

        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output

        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
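For reference, here is a minimal standalone sketch (added for illustration, not part of the project above) of the structure that this and the following snippets index into: Mystem().analyze() returns a list of dicts, each with a 'text' key and, for recognized words, an 'analysis' list whose first entry carries the lemma ('lex') and the grammar tag ('gr').

from pymystem3 import Mystem

m = Mystem()
for item in m.analyze("не люблю поздние пробуждения"):
    text = item.get('text', '')
    analyses = item.get('analysis') or []  # missing or empty for spaces, punctuation and unknown tokens
    if analyses:
        best = analyses[0]
        print(repr(text), best['lex'], best['gr'])
    else:
        print(repr(text), '(no analysis)')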
Code example #2
File: bot.py Project: dashajarikova/homework2year
def result():
    morph = MorphAnalyzer()
    if request.args:
        sent = request.args['sentence']
        m = Mystem()
        ana = m.analyze(sent)
        new_sent = open('sentence.txt', 'w', encoding='utf-8')
        for word in ana:
            if 'analysis' in word:
                forma_slova = word['analysis'][0]['gr']
                sent2 = clear_words
                for w in sent2:
                    ana2 = m.analyze(w)
                    try:
                        an_word = ana2[0]
                        if 'analysis' in an_word:
                            print(an_word)
                            forma_slova2 = an_word['analysis'][0]['gr']
                            if forma_slova == forma_slova2:
                                new_sent.write(w + ' ')
                                break
                    except IndexError:
                        pass
        new_sent.close()
        with open('sentence.txt', 'r', encoding='utf-8') as f:
            read_sent = f.read()
        return render_template('result.html', sentence=read_sent)
    return render_template('result.html')
Code example #3
def paral2():
    for sentence in st.sentences_from_text(text2):  # take a sentence from text2 and walk it
        for word in wt.tokenize(sentence):  # walk the words of the extracted sentence
            m = Mystem()
            analize = m.analyze(word)  # morphological analysis of the word
            print(analize)
            for item in analize:  # dig into the returned structure
                for parse in item.get('analysis', []):
                    gr = parse.get('gr', '')
                    # check whether the tag contains both "муж" (masculine) and "имя" (first name)
                    if 'муж' in gr and 'имя' in gr:
                        if Names2.get(word) is None:  # if this name is not in the name dictionary yet
                            Names2[word] = 1  # add it
                        else:
                            Names2[word] += 1  # otherwise increment its count
                        break  # leave the analysis loop
Code example #4
def paral1(q2, q22):  # define a function with two arguments, both queues
    _Kx = q22.get()  # pull from q22 the list of first/last sentence indices used by this process
    for k in _Kx:
        xNames = text2_[k]
        for word in wt.tokenize(xNames):  # walk the words of the selected sentence
            m = Mystem()
            analize = m.analyze(word)  # morphological analysis of the word
            print(analize)
            for item in analize:  # dig into the returned structure
                for parse in item.get('analysis', []):
                    gr = parse.get('gr', '')
                    # check whether the tag contains both "муж" (masculine) and "имя" (first name)
                    if 'муж' in gr and 'имя' in gr:
                        if Names2.get(word) is None:  # if this name is not in the name dictionary yet
                            Names2[word] = 1  # add it
                        else:
                            Names2[word] += 1  # otherwise increment its count
                        break  # leave the analysis loop
    q2.put(Names2)  # put the resulting dictionary on queue q2
Code example #5
File: lmtze.py Project: mannefedov/Relext
def lmtze(textfile):
    m = Mystem()
    text = open(textfile, encoding='utf-8').readlines()
    newfile = open(textfile.replace('txt', 'lem.txt'), 'w', encoding='utf-8')
    result_full = []
    for line in text:
        try:
            element = etree.fromstring(line.strip('\n'))
            text_ = element.xpath('text()')
            entities = element.xpath('*')
            result = ['<sent>']
            while text_:
                l = text_.pop(0)
                # open('temp.txt', 'w', encoding='utf-8').write(l)
                # subprocess.call(['C:\\Mystem\\mystem', 'i'])
                l = m.analyze(l)
                # print(l)
                for x in l:
                    if x.get('analysis') is not None:
                        if x.get('analysis') == []:
                            result.append(x['text'])
                        else:
                            result.append(x['analysis'][0]['lex'] + '_' + x['analysis'][0]['gr'].split(',')[0].split('=')[0])
                    else:
                        continue

                if text_:
                    e = entities.pop(0)
                    e_ = m.analyze(e.text)
                    result.append('<' + e.tag + '>')
                    for x in e_:
                        if x.get('analysis') is not None:
                            if x.get('analysis') == []:
                                result.append(x['text'])
                            else:
                                result.append(x['analysis'][0]['lex'])
                        else:
                            continue
                    result.append('</' + e.tag + '>')
        except Exception:
            continue
        result.append('</sent>')
        result_full.append(result)
        result = []
        print(len(result_full), ' разобралось')
    for sent in result_full:
        prev = ''
        for x in sent:
            if '<' in x and '/' not in x:
                newfile.write(prev + x)
                prev = ''
            elif '_' in x or x.isalpha():
                newfile.write(prev + x)
                prev = ' '
            else:
                newfile.write(x)
        newfile.write('\n')
Code example #6
    def extract(self):
        try:
            # list the files in the input directory
            input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
                my_list = list_of_terms
                list_of_terms = []
                for term in my_list:
                    if m.analyze(term)[0].get(u'analysis'):
                        if not m.analyze(term)[0][u'analysis'][0][u'gr'].startswith(self.service_parts_of_speech) and len(term) > 1:
                            list_of_terms.append(term)
                        if term == u'не':
                            list_of_terms.append(term)
                    else:
                        list_of_terms.append(term)
                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # fold repeated words into counts
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # compute tf
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # compute idf
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # write the result
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
Code example #7
class PyMyStemTagger:

    def __init__(self):
        self.tagger = Mystem()

    def parse(self, sentence):
        result = self.tagger.analyze(sentence)
        print(result)
        return [(t['text'].strip(), t['analysis'][0]['gr'] if 'analysis' in t and t['analysis'] else 'NONLEX') for t in result if t['text'].strip() not in {' ',''}]

    def tag_word(self, word):
        result = self.tagger.analyze(word)
        return [(t['text'].strip(), t['analysis'][0]['gr'] if 'analysis' in t and t['analysis'] else 'NONLEX') for t in result[:1] if t['text'].strip() not in {' ',''}]
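A possible usage sketch for the class above (added for illustration; the exact grammar strings depend on the installed mystem binary):

tagger = PyMyStemTagger()
# parse() returns (token, grammar tag) pairs; tokens without an analysis come back as 'NONLEX'
print(tagger.parse("Мама мыла раму."))
# tag_word() only looks at the first token of its input
print(tagger.tag_word("раму"))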
Code example #8
def verbs_statistics(text):
    '''
    Extracts from the text frequency data for verbs: their lemmas, transitivity and aspect
    :param text: the text
    :return: dictionaries with frequencies
    '''
    m = Mystem()  # create an instance of the analyzer class
    ana = m.analyze(text)
    print(ana)
    print(len(ana))
    for i in ana:
        try:
            i['analysis']
            print(i)
        except KeyError:
            print(int())
    pos = [
        i['analysis'][0]['gr'].split('=')[0].split(',')[0] for i in ana
        if i['text'].strip() and 'analysis' in i and i['analysis']
    ]
    verbs = [
        i['analysis'][0] for i in ana
        if i['text'].strip() and 'analysis' in i and i['analysis']
        and i['analysis'][0]['gr'].split('=')[0].split(',')[0] == 'V'
    ]
    print(verbs)
    print(type(pos))
    all_pos, v, ratio = pos_counter(pos)
    lemms = lemma_counter(verbs)
    tr, intr = trans_couter(verbs)
    s, ns, amb = aspect_counter(verbs)
    return all_pos, v, ratio, lemms, tr, intr, s, ns, amb
Code example #9
File: ageeva_learning.py Project: Sereni/assignments
    def __init__(self, path, doc_id, limit):
        """
        :param doc_id: numerical id of a document, pass manually
        """

        self.text = open(path).read().lower().replace('\n', '.')
        # need a better regex
        self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if sentence and len(sentence.split()) > 2]
        self.pos_data = []
        self.testing_data = []
        self.id = doc_id

        m = Mystem()
        counter = Counter(DEFAULTS)

        if not limit or limit > len(self.sentences):
            limit = len(self.sentences)

        for sentence in self.sentences[:limit]:

            # parse with mystem
            data = m.analyze(sentence)

            # get POS and count for each sentence
            pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
                   for word in data if word.get('analysis', None)]
            counter.update(pos)

            # append to dataset
            self.pos_data.append([counter[key] for key in sorted(counter)])

            # reset counter
            counter = Counter(DEFAULTS)
Code example #10
def mystem_normalizer(texts, batch_size=150, mapping=mystem2upos):
    """
    Normalizer(lemmatisation and PoS tagging) with Mystem backend.
    :param texts:
    :param batch_size:
    :param mapping:
    :return:
    """
    m = Mystem()  # not very good place to store it.

    for batch_start in range(0, len(texts), batch_size):
        batch = texts[batch_start: batch_start + batch_size]
        total = ' $ '.join(batch.apply(lambda x: x.replace('\n', '').replace('$', '')))

        text = []
        for word in m.analyze(total):
            if word['text'] == '$':
                yield ' '.join(text)
                text = []
                continue
            try:
                token = word['analysis'][0]
            except (KeyError, IndexError) as e:
                continue
            text.append(pos_extractor(token, mapping=mapping))
        yield ' '.join(text)
Code example #11
File: project.py Project: polinadyakonova/homeworks
def index(name = None):
    if request.args:
        story = request.args['joke'] 
        mystem = Mystem()
        gramm = mystem.analyze(story)
        characters = set()
        for i in gramm:
            if (str(i).find("од=") != -1) and (str(i).find("неод=") == -1):
                s1 = str(i)[str(i).find("'lex': '") + 8:]
                characters.add(s1[:s1.find("'")])
        
        file = open("corp.txt", 'r', encoding = "UTF-8")
        f = file.read()[1:].split('\n\n')
        file.close()
        
        file = open("ans.txt", 'w', encoding = "UTF-8")
        for i in f:
            words = ((re.sub('[,\.\?\!\—\-\(\)\:\;]', '', i)).lower()).split(' ')
            if characters <= set(words):
                f = file.write(i + '\n\n')
        file.close()
        with open("ans.txt", "r", encoding='utf-8') as f:
                content = f.read().split('\n\n')
        return render_template("index.html", content=content)        
    return render_template('index.html')
Code example #12
def mystem_new(file, msg):
    bigdata = {}
    f = open(file, 'r', encoding='utf-8')
    text = f.read()
    f.close()
    capitals = [
        'Й', 'Ц', 'У', 'К', 'Е', 'Н', 'Г', 'Ш', 'Щ', 'З', 'Х', 'Ф', 'Ы', 'В',
        'А', 'П', 'Р', 'О', 'Л', 'Д', 'Ж', 'Э', 'Я', 'Ч', 'С', 'М', 'И', 'Т',
        'Б', 'Ю'
    ]
    m = Mystem()
    f1 = open(file[:-4] + '_output.txt', 'w', encoding='utf-8')
    analyse = m.analyze(text)
    pos = ['A', 'S', 'V']
    for one in analyse:
        if len(msg) > 0:
            if 'analysis' in one and len(one['analysis']) != 0 and one[
                    'analysis'][0]['gr'][0] in pos:
                if one['text'][0] not in capitals:
                    sub_dic = one['analysis']
                    for value in sub_dic:
                        if 'lex' in value:
                            short = sketch_engine(value['lex'])
                            if len(short) > 1:
                                tr_text = dic_bin_codes(short)
                                for smt in tr_text:
                                    if len(msg) > 0:
                                        if tr_text[smt] == msg[0]:
                                            print(one['text'])
                                            print(smt)
                                            one['text'] = phpmorphy(smt, one)
                                            msg.remove(msg[0])
                                            break
        f1.write(one['text'])
    f1.close()
Code example #13
File: utils.py Project: sosophe/Druzhba
def tag_mystem(text='Текст нужно передать функции в виде строки!'):
    m = Mystem()
    #print(text)
    text = ''.join([x for x in text.split(';') if ('итература' not in x)])
    #print(text)
    processed = m.analyze(text)
    tagged = []

    for w in processed:
        try:
            if not w["analysis"]: continue
            lemma = w["analysis"][0]["lex"].lower().strip()
            if lemma in russian_sw:
                continue
            pos = w["analysis"][0]["gr"].split(',')[0]

            pos = pos.split('=')[0].strip()

            if pos in mapping:
                tagged.append(lemma + '_' + mapping[pos])  # convert the tag here
            else:
                tagged.append(lemma + '_X')  # in case we hit a tag that is not in the mapping

        except KeyError:
            continue
    return tagged, text
Code example #14
        def post(self):
            json_data = request.get_json(force=True)
            text = json_data['text']
            m = Mystem()
            result = m.analyze(text)

            return jsonify({"analysis": result})
Code example #15
def tag_mystem(mapping, text="Текст нужно передать функции в виде строки!"):
    m = Mystem()
    processed = m.analyze(text)
    tagged = []
    for w in processed:
        try:
            if w["analysis"]:
                lemma = w["analysis"][0]["lex"].lower().strip()
                pos = w["analysis"][0]["gr"].split(",")[0]
                pos = pos.split("=")[0].strip()
                #             print(lemma)
                if lemma not in set(russian_stopwords):
                    if pos in mapping:
                        tagged.append(lemma + "_" + mapping[pos])  # convert the tag here
                    else:
                        tagged.append(lemma + "_X")  # in case we hit a tag that is not in the mapping
            else:
                continue
        except KeyError:
            continue  # punctuation is skipped here, but you can handle it differently
    return tagged
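For illustration only, a hypothetical stand-in for the mapping and russian_stopwords globals that tag_mystem() expects (the names and values below are assumptions in the spirit of the usual Mystem-to-Universal-POS conversion), followed by a call:

# hypothetical stand-ins for the module-level globals used by tag_mystem()
mapping = {'S': 'NOUN', 'A': 'ADJ', 'V': 'VERB', 'ADV': 'ADV', 'PR': 'ADP'}
russian_stopwords = ['и', 'в', 'не', 'на']

print(tag_mystem(mapping, "Кошка быстро бежала домой"))
# expected shape: ['кошка_NOUN', 'быстро_ADV', 'бежать_VERB', 'домой_ADV']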
Code example #16
def from_freq(string):
    '''
    take a word
    return [ipm, r, d]
    '''
    big_ru = {}
    start = time.time()
    with open('./Freq2011/freqrnc2011.csv') as rus:
        ru = rus.readlines()[1:]
        for line in ru:
            lemma, pos, ipm, r, d, doc = line.split('\t')
            # lp = lemma + ',' + pos
            big_ru[lemma + ',' + pos] = [ipm, r, d]
    print("dictionary: %s seconds" % (time.time() - start))
    start_time = time.time()
    mystem = Mystem()
    mystemmed = mystem.analyze(string)
    print("pymystem: %s seconds" % (time.time() - start_time))
    lemma_mystem = mystemmed[0]['analysis'][0]['lex']
    pos_mystem = mystemmed[0]['analysis'][0]['gr'].split('=')[0].split(
        ',')[0].lower()
    try:
        return big_ru[lemma_mystem + ',' + pos_mystem]
    except:
        return ['', '', '']
Code example #17
File: genre_by_pos.py Project: Sereni/assignments
    def __init__(self, path):

        self.text = open(path).read().lower()
        self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if len(sentence) > 1]
        self.pos_data = []

        m = Mystem()
        counter = [0, 0, 0, 0, 0]

        for sentence in self.sentences:

            # parse with mystem
            # count adjectives A, nouns S, verbs V, adverbs ADV, pronouns PR
            data = m.analyze(sentence)
            for word in data:
                analysis = word.get('analysis', None)
                if analysis:
                    best = analysis[0]
                    gr = best['gr']
                    if 'S' in gr:
                        counter[3] += 1
                    elif 'ADV' in gr:
                        counter[1] += 1
                    elif 'A' in gr:
                        counter[0] += 1
                    elif 'V' in gr:
                        counter[4] += 1
                    elif 'PR' in gr:
                        counter[2] += 1

            self.pos_data.append(counter)
            counter = [0, 0, 0, 0, 0]

        self.data = np.array(self.pos_data)
Code example #18
def get_patterns():
    with open('expert_phrases.txt') as f:
        t = f.read()
    my = Mystem()
    analyz = my.analyze(t)
    fras = []
    word = []
    k = 0
    l = 0
    for i in range(0, len(analyz), 2):
        if analyz[i].get('analysis', 1) == 1:
            if analyz[i + 1].get('text').find('\n') != -1:
                fras.append(' '.join(word[k - l:k]))
                l = 0
            continue
        if analyz[i].get('analysis') != []:
            a = analyz[i].get('analysis')[0].get('gr')
            if a.find(',') != -1:
                if a.find(',') < a.find('='):
                    word.append(a[0:a.find(',')])
                    k += 1
                    l += 1
                else:
                    word.append(a[0:a.find('=')])
                    k += 1
                    l += 1
            else:
                word.append(a[0:a.find('=')])
                k += 1
                l += 1
        if analyz[i + 1].get('text').find('\n') != -1:
            fras.append(' '.join(word[k - l:k]))
            l = 0
    f = open('phrases_morphologe.txt', 'w')
    for i in range(len(fras)):
        f.write(fras[i] + '\n')
    f.close()
    print('phrases_morphologe.txt are written')
    fras.sort()
    k = 1
    paterns = []
    kol = []
    for i in range(len(fras) - 1):
        if fras[i] == fras[i + 1]:
            k += 1
        else:
            paterns.append(fras[i])
            kol.append(k)
            k = 1

    i = 0
    while i < len(paterns):
        if kol[i] == 1:
            paterns.pop(i)
            kol.pop(i)
        else:
            i += 1
    return paterns, kol
Code example #19
    def normalize(self, texts, path):
        '''Normalize texts in DataFrame object'''

        rus_dict_file = path + 'rus_stop_dict.txt'
        eng_dict_file = path + 'eng_stop_dict.txt'

        try:
            with open(rus_dict_file) as f:
                russian_stopwords = [line.rstrip('\n') for line in f]
        except Exception as err:
            print(err)

        try:
            with open(eng_dict_file) as f:
                english_stopwords = [line.rstrip('\n') for line in f]
        except Exception as err:
            print(err)

        text_list = []
        mystem = Mystem()
        for text in texts:
            text = text.lower()
            text = re.sub("<!--?.*?-->", "", text)
            text = re.sub("(\\d|\\W)+", " ", text)

            tokens = mystem.lemmatize(text)

            rus_r = re.compile("[а-я]+")
            eng_r = re.compile("[a-z]+")

            rus_tokens = [w for w in filter(rus_r.match, tokens)]
            eng_tokens = [w for w in filter(eng_r.match, tokens)]

            rus_tokens = [
                token for token in rus_tokens if token not in russian_stopwords
                and token != " " and token.strip() not in punctuation
            ]

            # take only substantive:

            rus_sub_tokens = []

            for token in rus_tokens:
                try:
                    if mystem.analyze(token)[0]['analysis'][0]['gr'][0] == 'S':
                        rus_sub_tokens.append(token)
                except:
                    pass

            eng_tokens = [
                token for token in eng_tokens if token not in english_stopwords
                and token != " " and token.strip() not in punctuation
            ]

            text = " ".join(rus_sub_tokens) + ' ' + " ".join(eng_tokens)
            text_list.append(text)

        return text_list
Code example #20
File: new_sentiment.py Project: rob1nzon/emotions
def get_inf(text):
    w = []
    m = Mystem()
    for a in m.analyze(text):
        try:
            w.append(a['analysis'][0]['lex'])
        except:
            pass
    return w
Code example #21
def make_clear_text(text):
    # text without special characters / digits / punctuation
    m = Mystem()
    clear_text = []
    lm = m.analyze(text.lower())
    for i in range(0, len(lm)):
        if 'analysis' in lm[i]:
            clear_text.append(lm[i]['text'])
    return clear_text
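A quick illustration of the filter above (added as a sketch): only items that carry an 'analysis' key survive, so whitespace and punctuation are dropped and the lowercased token text is kept.

print(make_clear_text("Привет, мир!"))
# roughly: ['привет', 'мир']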
Code example #22
File: features.py Project: pombredanne/senty
def with_not(directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' '))


        # handle "не" + (word) combinations
        nums_of_bigrams = []
        helping_words = [u'совсем', u'очень', u'слишком', u'самый']
        for i in range(0, len(list_of_terms)):
            if list_of_terms[i] == u'не' and list_of_terms[i+1] not in helping_words:
                if m.analyze(list_of_terms[i+1])[0].get(u'analysis'):
                    if not m.analyze(list_of_terms[i+1])[0][u'analysis'][0][u'gr'].startswith(u'S,'):
                        nums_of_bigrams.append((i, i+1))
            elif list_of_terms[i] == u'не' and list_of_terms[i+1] in helping_words:
                if m.analyze(list_of_terms[i+2])[0].get(u'analysis'):
                    if not m.analyze(list_of_terms[i+2])[0][u'analysis'][0][u'gr'].startswith(u'S,'):
                        nums_of_bigrams.append((i, i+2))
        for i in range(0, len(nums_of_bigrams)):
            if nums_of_bigrams[i][0] + 1 == nums_of_bigrams[i][1]:
                list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]]
                list_of_terms[nums_of_bigrams[i][1]] = ''
            elif nums_of_bigrams[i][0] + 2 == nums_of_bigrams[i][1]:
                list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]]
                list_of_terms[nums_of_bigrams[i][1] - 1] = ''
                list_of_terms[nums_of_bigrams[i][1]] = ''
        list_of_terms = filter(lambda x: x != '', list_of_terms)


        text_of_output = ' '.join(['%s' % term for term in list_of_terms])

        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output

        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
Code example #23
def mystem(sentence):
    m = Mystem()
    mystem_lemmas = []
    lemmas = m.lemmatize(sentence)
    for lemma in lemmas:
        ana = m.analyze(lemma)
        for analysis in ana:
            if 'analysis' in analysis:
                mystem_lemmas.append(lemma)
    return mystem_lemmas
Code example #24
File: project.py Project: saltymon/pykili-project
def pos_analyze(text):
    m = Mystem()
    a = 'analysis'
    pos_list = []
    mystemmed = m.analyze(text)
    for record in mystemmed:
        if a in record and record[a]:
            gr = record[a][0]['gr']
            pos = re.split(',|=', gr)[0]
            pos_list.append(pos)
    return pos_list
Code example #25
File: new_sentiment.py Project: rob1nzon/emotions
def getA(text):
    '''Get lemmas of all words whose grammar tag starts with 'A' (adjectives, adverbs, ...)'''
    m = Mystem()
    w = []
    for a in m.analyze(text):
        try:
            atype = (a['analysis'][0]['gr'][0])
        except:
            atype = ''
        if atype == 'A':
            w.append(a['analysis'][0]['lex'])
    return w
Code example #26
File: prog.py Project: evgeniyamiller/python_hw
def ms(text):
    m = Mystem()
    ana = m.analyze(text)
    arr = []
    for word in ana:
        if 'analysis' in word:
            if len(word['analysis']) > 0:
                gr = word['analysis'][0]['gr']
                pos = gr.split('=')[0]
                gram = gr.split('=')[1].split('|')[0].strip('()')
                arr.append(word['text'].lower() + ' ' + pos + ',' + gram)
    return arr
Code example #27
def pos_counter(text):
    '''
    Count parts of speech
    :param text: the text
    :return: a dictionary with POS frequencies
    '''
    m = Mystem()  # create an instance of the analyzer class
    ana = m.analyze(text)
    pos = [i['analysis'][0]['gr'].split('=')[0].split(',')[0] for i in ana if
           i['text'].strip() and 'analysis' in i and i['analysis']]
    c_pos = Counter(pos)
    return c_pos
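A quick sanity check for the function above (a sketch; it assumes Counter is imported from collections in the same module, and the exact counts depend on the mystem build):

from collections import Counter  # pos_counter() relies on this name being available

print(pos_counter("Мама мыла раму"))
# roughly: Counter({'S': 2, 'V': 1})  (two nouns and one verb)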
Code example #28
class MystemTextAnalyzer(TextAnalyzer):
    def __init__(self):
        self._mystem = Mystem()

    def process(self, cas):
        src_text = cas.input_text
        infos = self._mystem.analyze(src_text)
        cas.tokens = [
            tanno for tanno in (self._extract_token_anno(i) for i in infos)
            if tanno
        ]

    _GRAMMEME_SEP_RE = re.compile(r'[,=|()]')
    _POS_RE = re.compile(r'^\w+')

    @classmethod
    def _extract_token_anno(cls, info):
        if 'analysis' in info:
            morph_arr = info['analysis']
            if morph_arr:
                morph_item = morph_arr[0]
                lemma = morph_item['lex']
                tag = morph_item['gr']
                pos = None
                if tag is not None:
                    pos = cls._extract_pos(tag)
                else:
                    # tag is None, so we should force the lower case
                    lemma = lemma.lower()
                # lemma = cls._join_lemma_pos(lemma, tag)
                token_anno = TokenAnnotation(lemma, pos)
                if tag:
                    token_anno.grammemes = set(cls._GRAMMEME_SEP_RE.split(tag))
                    if '' in token_anno.grammemes:
                        token_anno.grammemes.remove('')
                else:
                    token_anno.grammemes = set()
                return token_anno
        # in other cases: no analysis OR empty analysis results => fallback to original text
        lemma = info['text'].strip()
        if lemma:
            lemma = lemma.lower()
            token_anno = TokenAnnotation(lemma, None)
            token_anno.grammemes = set()
            return token_anno
        else:
            return None

    @classmethod
    def _extract_pos(cls, tag):
        pos_match = cls._POS_RE.search(tag)
        return pos_match.group(0) if pos_match else None
Code example #29
def analyzer():
    source = set()
    part = parsing()
    for elem in part:
        source.add(elem.lower())
    mystem = Mystem()
    for item in source:
        result = mystem.analyze(item)
        info = result[0]
        verb = info['analysis']
        line = json.dumps(verb)
        if 'V' in line:
            print(item)
Code example #30
def processNews(source_path, source_name, headers, start_date, end_date,
                filesFolderPathPrefix, table_path):
    for single_date in daterange(start_date, end_date):
        date = single_date.strftime("%Y/%m/%d")
        links = getLinks(source_path, headers, date)

        for link in links:
            url = source_path + link
            try:
                f = urllib.request.urlopen(url)
            except urllib.request.HTTPError as e:
                if e.code == 404:
                    continue
            html_page = f.read().decode('utf-8')
            tree = lxml.html.fromstring(html_page)

            author, title, article = getArticleInfo(tree)

            tokens = article.split()
            words = []
            for token in tokens:
                if token != "—":
                    words.append(token)
            wordcount = len(words)

            path = filesFolderPathPrefix + date[0:7]
            if not os.path.exists(path):
                os.makedirs(path)
            file = open(path + "/" + title + ".txt", "w", encoding="utf-8")
            file.write(article)
            file.close()

            m = Mystem()
            lemmas = m.lemmatize(article)
            article_lemmatized = ''.join(lemmas)
            article_analyzed = m.analyze(article)

            path2 = filesFolderPathPrefix + date[0:7] + ' ' + 'mystem'
            if not os.path.exists(path2):
                os.makedirs(path2)
            file_mystem = open(path2 + "/" + title + ".txt",
                               'w',
                               encoding="utf-8")
            file_mystem.write(article_lemmatized + '\n' + '\n' + '\n' +
                              str(article_analyzed))
            file_mystem.close()

            table.write(";".join(
                [source_name, path, author, date, title, url,
                 str(wordcount)]) + "\n")
            table.close()
Code example #31
def remove_verbs(text_ru):
    m = Mystem()
    full_info = m.analyze(text_ru)
    result_text_ru = ""
    for element in full_info:
        check = 1
        if element.get('analysis') is not None:
            if len(element['analysis']) > 0:
                if element['analysis'][0]['gr'][0] == 'V':
                    check = 0

        if check == 1:
            result_text_ru += element['text']
    return result_text_ru
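A short usage sketch for the function above (added for illustration): tokens whose first analysis is tagged 'V' are dropped, everything else, including the whitespace items, is concatenated back.

print(remove_verbs("Кошка спит дома"))
# roughly: 'Кошка  дома' (the verb is removed; the surrounding whitespace and the trailing newline mystem appends remain)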
Code example #32
def mystem_tokenize(text):
    from pymystem3 import Mystem

    global MYSTEM
    if not MYSTEM:
        MYSTEM = Mystem(
            grammar_info=False,
            entire_input=True,
            disambiguation=False,
            weight=False
        )

    data = MYSTEM.analyze(text)
    chunks = parse_mystem(data)
    return find_substrings(chunks, text)
Code example #33
class MystemTokenizer:
    label = 'mystem'

    def __init__(self):
        from pymystem3 import Mystem

        self.analyzer = Mystem(grammar_info=False,
                               entire_input=True,
                               disambiguation=False,
                               weight=False)

    def __call__(self, text):
        data = self.analyzer.analyze(text)
        chunks = parse_mystem(data)
        return find_substrings(chunks, text)
Code example #34
def process_mystem(words, lang):
    m = Mystem()
    analysis = m.analyze(words)

    with open(lang + '_processed.txt', 'w', encoding='utf-8') as file:
        for elem in analysis:
            if elem['text'] != ' ' and elem['text'] != '\n':
                try:
                    token = elem['text']
                    lemma = elem['analysis'][0]['lex']
                    pos_tag = elem['analysis'][0]['gr'].split(',')[0].split(
                        '=')[0]
                    info = '%s\t%s\t%s\n' % (token, lemma, pos_tag)
                    file.write(info)
                except:
                    pass
Code example #35
def count_words(posts, needed_pos=None):
    '''
    Count lemmas
    :param posts: the text
    :param needed_pos: parts of speech whose lemmas should be counted
    :type posts: str
    :type needed_pos: list
    :return: a dictionary {lemma : frequency in the text}, sorted by decreasing frequency
    :rtype: dict
    '''
    m = Mystem()  # create an instance of the analyzer class
    ana = m.analyze(posts)
    words = [i['analysis'][0] for i in ana if i['text'].strip() and 'analysis' in i and i['analysis']]
    if needed_pos is not None:
        words = [i for i in words if i['gr'].split('=')[0].split(',')[0] in needed_pos]
    return lemma_counter(words)
Code example #36
def pos_bi(text):
    pos_tags = []
    m = Mystem()
    sents = sent_tokenize(text)
    for sent in sents:
        sent_an = []
        analy = m.analyze(sent)
        for x in analy:
            try:
                if 'analysis' in x.keys():
                    tag = x['analysis'][0]['gr']
                    sent_an.append(re.sub(r'[=|,].*', '', tag).lower())
            except IndexError:
                pass
        pos_tags.append(sent_an)
    return pos_tags
Code example #37
File: learning_news.py Project: Sereni/assignments
    def build_pos(self):

        m = Mystem()
        counter = Counter(DEFAULTS)

        for doc in self.documents:

            # parse with mystem
            data = m.analyze(doc.text)

            # get POS and count for each sentence
            pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
                   for word in data if word.get('analysis', None)]
            counter.update(pos)

            # append to dataset
            self.pos_data.append([counter[key] for key in sorted(counter)])

            # reset counter
            counter = Counter(DEFAULTS)
Code example #38
File: lemma.py Project: Brinit/nlp
def produce_lemmas(connection, tableName, outputTableName):
    mystem = Mystem()
    cursor = connection.cursor()
    inserter = connection.cursor()

    query = 'DELETE FROM `%s`' % outputTableName
    inserter.execute(query)
    connection.commit()

    query = 'SELECT * FROM `%s`' % tableName
    cursor.execute(query)
    query = 'INSERT INTO `' + outputTableName + '` (`' + tableName + '_id`, `word_class_id`, `lex`, `gr`)' \
            'SELECT %i, `id`, "%s", "%s" FROM `word_classes` WHERE `abbr`="%s"'
    for id, concept, scheme in cursor:
        lemmas = mystem.analyze(concept)
        for lemma in lemmas:
            for analysis in lemma.get('analysis', []):
                inserter.execute(query % prepare_content(id, analysis))
    connection.commit()

    cursor.close()
Code example #39
File: other.py Project: 2vitalik/collocations
def fill_mystem():
    from pymystem3 import Mystem
    m = Mystem()
    for sentence in get_sentences(1):
        lemmas = m.analyze(sentence.source)
        items = list()
        for lemma in lemmas:
            text = lemma['text']
            analysis = lemma.get('analysis')
            if not analysis:
                text = text.strip()
                if not len(text):
                    print 'spaces = "%s"' % text
                    continue
                if ' ' in text:
                    for item in re.split('\s+', text):
                        items.append("%s   %s ?" % (item, item))
                    print 'several =', "|".join(re.split('\s+', text))
                    continue
                print 'delimiter = "%s"' % text
                items.append("%s   %s ?" % (text, text))
                continue

            if not len(text.strip()):
                raise Exception('Impossible')
            if ' ' in text:
                raise Exception('Impossible')

            lexemes = list()
            for lexeme in analysis:
                print 'lex=', lexeme.get('lex', '-')
                print 'gr=', lexeme.get('gr', '-')
                lexemes.append("%s %s" % (lexeme['lex'], lexeme['gr']))
            items.append("%s   %s" % (text, '  '.join(lexemes)))
        sentence.mystem = '\n'.join(items)
        sentence.save()
Code example #40
File: mystem.py Project: CapitainCrunch/garbage
__author__ = 'Bogdan'
# encoding=utf-8
from pprint import pprint
from pymystem3 import Mystem
import codecs, re

mystem = Mystem()



fulltext = ''
textout = ''
f = codecs.open('1.txt', 'r', 'utf-8')
for line in f: fulltext+=line
lemmas = mystem.analyze(fulltext)
for lemm in lemmas:
    for k,v in lemm.items():
        if k == 'analysis':
            for new in v:
                for n1,n2 in new.items():
                    #print n1,n2
                    textout += n1+' '+n2
                    textout += '\r\n'
print textout

fout = codecs.open('out.txt', 'w', 'utf-8')

m = re.findall('lex\s(\w+)', textout, flags=re.U)
a = m[:999]
print len(set(a))
Code example #41
class MystemOCTagger(object):
	def __init__(self):
		self.mystem_inst = Mystem()


	def run_and_convert(self, input_file, output_file, strict_match = False):
		f_in = open(input_file, 'rb')
		f_out = open(output_file, 'w+')
		context = etree.iterparse(f_in, tag='sentence')
		for event, sentence_elem in context:
			sentence = sentence_elem.find('source')
			analyzed = self.analyze_sentence(sentence.text)
			tokens_tree = sentence_elem.find('tokens')
			tokens = self.extract_tokens(tokens_tree)
			matched = self.match_analyzed_tokens(tokens, analyzed, strict_match)

			result = self.analyzed_to_csv_list(matched)
			for s in result:
				f_out.write(s+'\n')

			sentence_elem.clear()

	def analyze_sentence(self, sentence):
		return self.mystem_inst.analyze(sentence)

	# builds word-index mapping, indices sorted in order of appearance
	def extract_tokens(self, tokens_tree):
		tokens_dict = {}
		for t in tokens_tree.iter('token'):
			idx = t.get('id')
			token = t.get('text')
			token = strip_word(token)
			if (len(token) > 0):
				if token in tokens_dict:
					tokens_dict.get(token).append(idx)
				else:
					tokens_dict[token] = [idx]

		return tokens_dict


	# matches analysis with original tokens indices   
	def match_analyzed_tokens(self, tokens_index, analyzed, strict_match = False):
		analysis_indexed = {}
		unindexed = []
		for t in analyzed:
			t_text = t.get('text')
			t_text = strip_word(t_text)
			if len(t_text) > 0:
				if t_text in tokens_index:
					idx = tokens_index.get(t_text).pop(0)
					if (len(tokens_index.get(t_text)) == 0):
						tokens_index.pop(t_text)
					analysis_indexed[idx] = t.get('analysis')
				else:
					unindexed.append(t)

		if (not strict_match):
			analysis_not_strict = {}
			if len(tokens_index) > 0:
				analysis_not_strict = self.match_not_strict(tokens_index, unindexed)

			analysis_indexed.update(analysis_not_strict)

		not_analyzed = []
		if len(tokens_index) > 0:
			for t in tokens_index:
				not_analyzed.append(t)

#		if len(not_analyzed) > 0:
#			f_unindexed = open('mismatch.txt', 'a+')
#			f_unindexed.write('oc ')
#			f_unindexed.write(str(not_analyzed)+'  ')
#
#			if len(unindexed) > 0:
#				f_unindexed = open('mismatch.txt', 'a+')
#				for u in unindexed:
#					f_unindexed.write(' ')
#					f_unindexed.write(str(u.get('text')))

#			f_unindexed.write('\n')


		return analysis_indexed

	def match_not_strict(self, tokens_index, analyzed):
		analysis_indexed = {}
		for t_indexed, idx_list in tokens_index.items():
			for idx in idx_list:
				for i in range(0, len(analyzed)):
					t_analyzed = analyzed[i]
					if t_indexed.endswith(t_analyzed.get('text')):
						analysis_indexed[idx] = t_analyzed.get('analysis')
						#print(t_analyzed.get('text')+' '+t_indexed)
						analyzed.pop(i)
						idx_list.remove(idx)
						break

		idx_copy = tokens_index.copy()
		for t, i in idx_copy.items():
			if len(i) == 0:
				del tokens_index[t]


		return analysis_indexed

	def analyzed_to_csv_list(self, analyzed):
		out = []
		for idx, analysis in sorted(analyzed.items()):
			if analysis and len(analysis) > 0:
				#do we need only grammar?        
				s = str(idx) + ', ' + str(analysis[0].get('gr'))
				out.append(s)

		return out
Code example #42
File: genre_visualize.py Project: Sereni/assignments
    def __init__(self, path):

        self.text = open(path).read().lower()
        self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if
                          len(sentence) > 1]
        self.pos_data = []

        # compute all the things!
        # I started having fun with list comprehensions, but it quickly got out of hand...
        # if stuff's too slow, will stick everything into one for-loop.

        # length of sentences in letters
        self.sentence_lengths = [len([char for char in sentence if char not in PUNCT]) for sentence in
                                 self.sentences]

        # number of different letters in the sentence
        self.sentence_letters = [len(set(char for char in sentence if char not in PUNCT)) for sentence in
                                 self.sentences]

        # number of vowels in a sentence
        self.sentence_vowels = [len([char for char in sentence if char in VOWELS]) for sentence in self.sentences]

        # median of letters in a word
        self.median_letters = [np.median([len(word.strip(PUNCT)) for word in sentence.split()]) for sentence in
                               self.sentences]

        # median of vowels in a word
        self.median_vowels = [
            np.median([len([char for char in word if char in VOWELS]) for word in sentence.split()]) for sentence
            in self.sentences]

        # word length params
        self.sentlens = [[len(word) for word in sentence.split()] for sentence in self.sentences if len(sentence) > 1]

        m = Mystem()
        counter = [0, 0, 0, 0, 0]

        for sentence in self.sentences:

            # parse with mystem
            # count adjectives A, nouns S, verbs V, adverbs ADV, pronouns PR
            data = m.analyze(sentence)
            for word in data:
                analysis = word.get('analysis', None)
                if analysis:
                    best = analysis[0]
                    gr = best['gr']
                    if 'S' in gr:
                        counter[3] += 1
                    elif 'ADV' in gr:
                        counter[1] += 1
                    elif 'A' in gr:
                        counter[0] += 1
                    elif 'V' in gr:
                        counter[4] += 1
                    elif 'PR' in gr:
                        counter[2] += 1

            self.pos_data.append(counter)
            counter = [0, 0, 0, 0, 0]

        # and join
        self.data = np.array(list(zip(self.sentence_lengths,
                                      self.sentence_letters,
                                      self.sentence_vowels,
                                      self.median_letters,
                                      self.median_vowels,
                                      [item[0] for item in self.pos_data],
                                      [item[1] for item in self.pos_data],
                                      [item[2] for item in self.pos_data],
                                      [item[3] for item in self.pos_data],
                                      [item[4] for item in self.pos_data],
                                      [len(sentence) for sentence in self.sentlens],
                                      [np.mean(sentence) for sentence in self.sentlens],
                                      [np.median(sentence) for sentence in self.sentlens]
                                      )))
Code example #43
class CsvHandler:
    INPUTFILE = 'wiki_noxml_full.txt'
    OUTPUTFILE = 'my_frequency_list.csv'

    def __init__(self):
        self.file_name = self.INPUTFILE
        self.csvlength = 0
        self.lemmatiser = Mystem()
        #self.freq_dict = {}
        self.fd = defaultdict(dict)

    def do_cprofile(func):
        def profiled_func(*args, **kwargs):
            profile = cProfile.Profile()
            try:
                profile.enable()
                result = func(*args, **kwargs)
                profile.disable()
                return result
            finally:
                profile.print_stats()
        return profiled_func

    def get_freq_dict(self, filename):

        t0 = time.time()
        print("Start freq dict")
        counter = 0
        with open(filename, 'r') as csvfile:
            datareader = csv.reader(csvfile, delimiter='\t')
            for ln, row in enumerate(datareader):
                if ln % 100 == 0: print(ln, "articles processed")
                input_text = row[2]
                counter += 1
                #if counter > 10:
                    #break
                lemmas = self.get_lem_set(input_text)

                for i,li in enumerate(lemmas):
                    self.fd[li] = 1 if li not in self.fd else self.fd[li] + 1

        t1 = time.time()
        for a,b in self.fd.items():
            print(a,b)
        print("Finished. Get input file processing time %2.2f secs, whoosh !" % (t1 - t0))

    def get_lem_set(self, text):

        return_set = set()
        for el in self.lemmatiser.analyze(text):
            analysis = el.get('analysis', None)

            if analysis:
                POS = ['A=', 'S,', 'V=']
                if (analysis[0].get('gr')[0:2] in POS) and (len(analysis[0].get('lex'))>1):
                    return_set.add(analysis[0].get('lex'))

        return return_set

    def output_dict(self, filename, output_dictionary, threshold):
        t0 = time.time()
        with open(filename, 'w', newline='', encoding="UTF-8") as csv_file:

            csv_writer = csv.writer(csv_file, dialect='excel')

            csv_writer.writerow(["First word", "Second word", "Frequency"])

            for key in output_dictionary.keys():

                if output_dictionary[key] > threshold:
                    words = key.split(':::')
                    first_word = words[0]
                    second_word = words[1]

                    csv_writer.writerow([
                        first_word,
                        second_word,
                        output_dictionary[key]
                    ])

            csv_file.flush()
            csv_file.close()
        t1 = time.time()
        print("Finished. Get output file processing time %2.2f secs, whoosh !" % (t1 - t0))


    def process(self):
        self.get_freq_dict(self.file_name)
Code example #44
File: denotat.py Project: mforv/Signify
def main(argv):			
		with open(argv[1], encoding='utf-8') as f:
		    s = re.sub(r'\s+', ' ', f.read(), flags=re.M)
		f=re.split(r'(?<=[.!?…]) ',s)
		sentens=[]
		for i,t in enumerate(f):
		    sentens.append(t)
		    print(str(i)," ",t)




		morph = pymorphy2.MorphAnalyzer() 

		ZnakiP=[",","!","/n",".",":",";",'"',"'","\n","...","?","!","(",")","-"," ","  "]
		t = Mystem()
		PARS=[]
		for sent in sentens:
		    input_file=open("input.txt","w",encoding="utf-8")
		    input_file.write(sent)
		    input_file.close()
		    
		    # Run syntactic analysis of the text and find the grammatical cores (subject/predicate pairs)
		    process = subprocess.Popen('tomitaparser.exe config.proto', stdout=subprocess.PIPE,shell=True) 
		    process.communicate()
		    process.wait()
		    
		    predicate=[]
		    Nouns=[]
		    DOP=[]
		    DOP.append({})
		    OPR=[]
		    with open("pretty.html",encoding='utf8') as fp:
		            soup = BeautifulSoup(fp,"html.parser")    
		    par_f=soup.find_all('table')
		    for table in par_f:
		        th=table.find('th')    
		        if(th.text=="Noun1"):
		            slovo=th.find_parent("table").find('a').text
		            Nouns.append(slovo)
		        if(th.text=="Verb1"):
		            slovo=th.find_parent("table").find('a').text
		            predicate.append(slovo)
		        if(th.text=="OPR1"):
		            sl=th.find_parent("table").find_all('a')
		            for slovo in sl:
		                OPR.append(slovo.text)
		        if(th.text=="DOP1"):
		            sl=th.find_parent("table").find_all('a')
		            for slovo in sl:
		                DOP[0][slovo.text.lower()]=slovo.next_element.next_element.next_element.next_element
		    TREE={}
		    TREE[Nouns[0]]={} 

		    

		    for v in predicate:
		        TREE[Nouns[0]][v]={}
		    if(OPR!=[]):
		            for temp in OPR:
		                for noun in TREE:
		                    if(len(re.split(r"[,' ']",temp))==1):
		                        TREE[Nouns[0]][temp]=t.analyze(temp)[0]['analysis'][0]['gr']
		                    else:
		                            m2=[]
		                            for f in re.split(r"[,' ']",temp):
		                                if(f!=''):
		                                    m2.append(f)
		                            if(noun in m2):
		                                mk=t.analyze(temp)
		                                wsp=[]
		                                for tr in mk:
		                                    if(not tr['text'] in ZnakiP):
		                                        if(not 'CONJ' in tr['analysis'][0]['gr']):
		                                            wsp.append(tr['text'])
		                                for tl in wsp:
		                                    if(tl!=noun):
		                                        TREE[Nouns[0]][tl]=t.analyze(tl)[0]['analysis'][0]['gr']



		    for temp in TREE[Nouns[0]]:
		        if(temp in DOP[0].values()):
		            for sp in DOP[0]:
		                if(DOP[0][sp]==temp):
		                    m2=[]
		                    for f in re.split(r"[,' ']",sp):
		                        if(f!=''):
		                            m2.append(f)                         
		                    for rg in m2:                    
		                        TREE[Nouns[0]][temp][rg]={}
		                        for _opr in OPR:
		                            reg=re.split(r"[,' ']",temp)                        
		                            if(noun in reg):
		                                mk=t.analyze(_opr)
		                                wsp=[]
		                                for tr in mk:
		                                    if(not tr['text'] in ZnakiP):
		                                        if(not 'CONJ' in tr['analysis'][0]['gr']):
		                                            wsp.append(tr['text'])
		                                for tl in wsp:
		                                    if(tl!=rg):                                
		                                        TREE[Nouns[0]][temp][rg][tl]=t.analyze(tl)[0]['analysis'][0]['gr']


		  
		    
		    for noun in TREE:
		        d1=[noun]
		        for verb in TREE[noun]:
		            if(morph.parse(verb)[0].tag.POS=='ADJF'):            
		                d2=[noun,'быть']
		                d2.append(verb)
		                if(not d2 in PARS):
		                    PARS.append(d2.copy()) 
		                d2.pop()
		            else:
		                d4=[verb,"может быть"]
		                d1.append(verb)            
		                for temp in TREE[noun][verb]:            
		                            if(morph.parse(temp)[0].tag.POS=='NOUN'):
		                                d1.append(morph.parse(temp)[0].normal_form)
		                                if(not d1 in PARS):
		                                        PARS.append(d1.copy())
		                                d1.pop()
		                                d3=[temp,'быть']    
		                                
		                                for temp2 in TREE[noun][verb][temp]:
		                                        d3.append(temp2)
		                                        PARS.append(d3.copy())
		                                        d3.pop()
		                            else:
		                                d4.append(temp)
		                                if(not d4 in PARS):
		                                    PARS.append(d4.copy())
		                                d4.pop()


		    
		obj = PARS.copy()

		g1=gv.Digraph(format='png')

		for temp in obj:
		    a=morph.parse(temp[0])[0].tag.POS
		    if(a=='VERB' or a=='INFN'):
		            for t in obj:
		                if(t[1]==temp[0]):
		                    g1.node(t[0],shape='rect',style='filled',fillcolor='#cccccc')
		                    g1.node(temp[0])
		                    g1.node(temp[2],shape='rect',style='filled',fillcolor='#cccccc')
		                    g1.edge(t[0],temp[0])                    
		                    g1.edge(temp[0],temp[2],label=temp[1])
		                    g1.edge(temp[0],t[2])
		                    
		    else:
		        g1.node(temp[0],shape='rect',style='filled',fillcolor='#cccccc')
		        g1.node(temp[2],shape='rect',style='filled',fillcolor='#cccccc')
		        g1.edge(temp[0],temp[2],label=temp[1])

		print(g1.source)
		g1.render('img/'+argv[2])
Code example #45
import os, json, dicttoxml
from pymystem3 import Mystem

m = Mystem()
top = 'C:\\Users\\John\\Desktop\\py_files\\питон\\korpus\\no_marks'
for root, dirs, files in os.walk(top):
    for name in files:
        loc = os.path.join(root, name)
        loc_list = loc.split('\\')  #creates list in order to remove path content
        new_root = loc.replace('\\no_marks\\{0}\\{1}\\{2}'.format(loc_list[8], loc_list[9], loc_list[10]), '') #removes path ending
        dir_marks = os.path.join(new_root + '\\marks\\{0}\\{1}'.format(loc_list[8], loc_list[9]))   #adds new path ending for json.docs
        dir_xml = os.path.join(new_root + '\\xml\\{0}\\{1}'.format(loc_list[8], loc_list[9]))       #adds new path ending for xml docs
        new_name = name.replace('.txt', '')
        if not os.path.exists(dir_marks):   #makes necessary dirs if not present
            os.makedirs(dir_marks)
        if not os.path.exists(dir_xml):
            os.makedirs(dir_xml)
        with open(loc, "r", encoding = 'utf-8') as doc:
            text_doc = doc.read()
            analysis = m.analyze(text_doc)                      #grammatical and lemma info for the whole document
            info = json.dumps(analysis, ensure_ascii = False)   #serialises the analysis as JSON
        with open("{0}\\{1}.json".format(dir_marks, new_name), 'w', encoding = 'utf-8') as doc_marks:
            doc_marks.write(info)
        xml = dicttoxml.dicttoxml(analysis).decode('utf-8')     #converts the analysis itself (not the JSON string) to xml
        with open("{0}\\{1}.xml".format(dir_xml, new_name), 'w', encoding = 'utf-8') as doc_xml:
            doc_xml.write(xml)
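
For orientation, each element of the list that m.analyze() returns (and that is serialised to JSON and XML above) is a small dict; a purely illustrative sketch of one item with made-up values ('analysis' is absent for punctuation, digits and unknown tokens):

sample_item = {
    'text': 'воды',
    'analysis': [{'lex': 'вода', 'gr': 'S,жен,неод=род,ед'}],   # illustrative only; real gr strings may list several variants
}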


コード例 #46
0
ファイル: _helloworld.py プロジェクト: 2vitalik/collocations
# coding: utf-8

from pymystem3 import Mystem
# text = "Голкипер «Нью-Йорк Айлендерс» а-б г-н ваыва-ыфвафыа Выступая на пресс-конференции в Лондоне, он подчеркнул, что опубликованные необработанные отчеты с мест боевых действий не содержат имен или информации, которая может повредить каким-либо лицам или организациям. Красивая, — 123.2 latin мама 4,5 7:8 красиво мыла раму"
text = "слив воды"
m = Mystem()
# lemmas = m.lemmatize(text)
# print(''.join(lemmas))
lemmas = m.analyze(text)
for lemma in lemmas:
    print '#"%s"' % lemma['text']
    a = lemma.get('analysis')
    # print a
    if a:
        for b in a:
            print 'lex=', b.get('lex', '-')
            print 'gr=', b.get('gr', '-')
    print
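
The snippet above uses Python 2 print statements; a minimal Python 3 sketch of the same loop (assuming only that pymystem3 is installed) would be:

from pymystem3 import Mystem

m = Mystem()
for item in m.analyze("слив воды"):
    print('#"%s"' % item['text'])
    for parse in item.get('analysis') or []:
        print('lex=', parse.get('lex', '-'))
        print('gr=', parse.get('gr', '-'))
    print()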
コード例 #47
0
import cProfile
import csv
import time
from collections import defaultdict

from pymystem3 import Mystem


class CsvHandler:
    #INPUTFILE = 'corpus-ru-dbpedia-short-dea.csv'
    #INPUTFILE = 'wiki-one-line.txt'
    INPUTFILE = 'wiki_noxml_full.txt'
    OUTPUTFILE = 'my_output-large.csv'

    def __init__(self):
        self.file_name = self.INPUTFILE
        self.csvlength = 0
        self.lemmatiser = Mystem()
        #self.freq_dict = {}
        self.fd = defaultdict(dict)

    def do_cprofile(func):
        def profiled_func(*args, **kwargs):
            profile = cProfile.Profile()
            try:
                profile.enable()
                result = func(*args, **kwargs)
                profile.disable()
                return result
            finally:
                profile.print_stats()
        return profiled_func

    def get_freq_dict(self, filename):

        t0 = time.time()
        counter = 0
        with open(filename, 'r') as csvfile:
            datareader = csv.reader(csvfile, delimiter='\t')
            for ln, row in enumerate(datareader):
                if ln % 100 == 0: print(ln, "articles processed")

                input_text = row[2]
                counter += 1
                #if counter > 100:
                    #break
                lemmas = self.get_lem_set(input_text)

                for i, li in enumerate(lemmas):
                    for j, lj in enumerate(lemmas):
                        if i < j:
                            self.fd[li][lj] = 1 if lj not in self.fd[li] else self.fd[li][lj] + 1
                            #key = li + ":::" + lj
                            #if self.freq_dict.get(key, None):
                            #    self.freq_dict[key] += 1
                            #else:
                            #    self.freq_dict[key] = 1
        t1 = time.time()
        for a in self.fd:
            for b in self.fd[a]:
                print(a, b, self.fd[a][b])
        print("Finished. Get input file processing time %2.2f secs, whoosh !" % (t1 - t0))

    #@do_cprofile
    #def sort_dict(self):
    #    return OrderedDict(sorted(self.freq_dict.items(), key=lambda t: t[1], reverse=True))


    def get_lem_set(self, text):

        return_set = set()
        for el in self.lemmatiser.analyze(text):
            analysis = el.get('analysis', None)

            if analysis:
                # Mystem gr prefixes: 'A=' adjective, 'S,' noun, 'V=' verb; skip one-letter lemmas
                POS = ['A=', 'S,', 'V=']
                if (analysis[0].get('gr')[0:2] in POS) and (len(analysis[0].get('lex'))>1):
                    return_set.add(analysis[0].get('lex'))

        '''
        for el in self.lemmatiser.lemmatize(text):
            el = el.strip()
            if (el not in punctuation) and (not el.isdigit()):
                return_set.add(el)
        '''

        return return_set

    def output_dict(self, filename, output_dictionary, threshold):
        t0 = time.time()
        with open(filename, 'w', newline='', encoding="UTF-8") as csv_file:

            csv_writer = csv.writer(csv_file, dialect='excel')

            csv_writer.writerow(["First word", "Second word", "Frequency"])

            for key in output_dictionary.keys():

                if output_dictionary[key] > threshold:
                    words = key.split(':::')
                    first_word = words[0]
                    second_word = words[1]

                    csv_writer.writerow([
                        first_word,
                        second_word,
                        output_dictionary[key]
                    ])

            csv_file.flush()
            csv_file.close()
        t1 = time.time()
        print("Finished. Get output file processing time %2.2f secs, whoosh !" % (t1 - t0))


    def process(self):
        self.get_freq_dict(self.file_name)
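
A minimal way to run the class above (a sketch; it assumes INPUTFILE exists and is tab-separated with the article text in the third column):

if __name__ == '__main__':
    handler = CsvHandler()
    handler.process()   # builds and prints the lemma co-occurrence counts from INPUTFILE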
コード例 #48
0
ファイル: data_preparation.py プロジェクト: alexeyev/nm
    if options:
        title = options.group(1)
        for stuff in title.split('|'):
            yield gr.replace("(" + title + ")", stuff)
    else:
        yield gr


lines = set([])

with open("data/test.txt", "r") as input_file:
    logging.info("file opened")

    for line in input_file:
        for w in m.analyze(line):

            if 'analysis' in w:
                for item in w['analysis']:
                    for gramm_info in parse_gr(item['gr']):
                        lines.add("\t".join(
                            [gramm_info, item['lex'], w['text'].lower()]).encode("utf-8") + "\n")

with open("data/pairs_with_grammar.tsv", "w+") as f:
    for line in lines:
        f.write(line)

dict = {}

for line in open("data/pairs_with_grammar.tsv", "r+"):
    if line.strip():
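
The fragment above starts mid-way through parse_gr; under the assumption that it expands the parenthesised alternatives Mystem writes into its gr strings (e.g. 'S,жен,неод=(вин,мн|род,ед)'), the full generator might look roughly like this (only the if/else body is taken from the fragment, the regex is an assumption):

import re

def parse_gr(gr):
    # assumed: pick out the first parenthesised group of '|'-separated alternatives
    options = re.search(r"\(([^)]*)\)", gr)
    if options:
        title = options.group(1)
        for stuff in title.split('|'):
            yield gr.replace("(" + title + ")", stuff)
    else:
        yield gr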
コード例 #49
0
ファイル: wiki.py プロジェクト: Digsolab/russe-evaluation
output_fpath = sys.argv[1] if len(sys.argv) > 1 else "output.txt"

tic = time()
with codecs.open(output_fpath, "w", "utf-8") as output:
    m = Mystem()
    i = 0 
    for line in sys.stdin:
        try: 
            i += 1
            if i % 1000 == 0: print i
            f = line.split("\t")
            url, title, text = f[0], f[1], ' '.join(f[2:])
     
            print >> output, "<doc url='%s' title='%s'>" % (url, title.decode("utf-8"))
            res = m.analyze(text)
            for r in res:
                if "analysis" not in r or "text" not in r: continue
     
                if len(r["analysis"]) < 1 or "lex" not in r["analysis"][0] or "gr" not in r["analysis"][0]:
                    print >> output, "%s\t%s\t%s" % (r["text"], r["text"], "?") 
                else:
                    pos = re.split('=|,', r["analysis"][0]["gr"])[0] 
                    print >> output, "%s\t%s\t%s" % (r["text"], r["analysis"][0]["lex"], pos)
            print >> output, "</doc>"
        except:
            print "Bad line: '%s'" % line
            print "Error:", traceback.format_exc()
            print "Fields num:", len(line.split("\t"))
print "Elapsed:", time() - tic, "sec."