Example #1
def index(name = None):
    if request.args:
        story = request.args['joke'] 
        mystem = Mystem()
        gramm = mystem.analyze(story)
        characters = set()
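        # keep only lemmas that Mystem tags as animate ("од=" present, "неод=" absent); these are treated as the joke's characters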
        for i in gramm:
            if (str(i).find("од=") != -1) and (str(i).find("неод=") == -1):
                s1 = str(i)[str(i).find("'lex': '") + 8:]
                characters.add(s1[:s1.find("'")])
        
        file = open("corp.txt", 'r', encoding = "UTF-8")
        f = file.read()[1:].split('\n\n')
        file.close()
        
        file = open("ans.txt", 'w', encoding = "UTF-8")
        for i in f:
            words = ((re.sub('[,\.\?\!\—\-\(\)\:\;]', '', i)).lower()).split(' ')
            if characters <= set(words):
                file.write(i + '\n\n')
        file.close()
        with open("ans.txt", "r", encoding='utf-8') as f:
                content = f.read().split('\n\n')
        return render_template("index.html", content=content)        
    return render_template('index.html')
Example #2
def without_pronouns(directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' '))
        my_list = list_of_terms
        list_of_terms = []
        for term in my_list:
            if m.analyze(term)[0].get(u'analysis'):
                if not m.analyze(term)[0][u'analysis'][0][u'gr'].startswith((u'SPRO', u'APRO')):
                    list_of_terms.append(term)
            else:
                list_of_terms.append(term)
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])

        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output

        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
Example #3
def mystem_using_with_considering_of_multiple_letters(input_directory, output_directory):
        input_files = filter(lambda x: not x.endswith('~'), os.listdir(input_directory))
        output_data = {}
        m = Mystem()
        # iterate over the documents
        for input_file in input_files:
            with open(input_directory + '/' + input_file) as data_file:
                data = json.load(data_file)
            list_of_terms = filter(lambda x: x != '', re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', data['text']))
            my_list_of_terms = []
            for term in list_of_terms:
                if term == m.lemmatize(term)[0]:
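                    # the word is already its own lemma: collapse runs of identical letters (e.g. "оооочень" -> "очень")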
                    my_term = term
                    term = u''
                    prev_letter = my_term[0]
                    term += my_term[0]
                    for i in range(1, len(my_term)):
                        if my_term[i] != prev_letter:
                            term += my_term[i]
                        prev_letter = my_term[i]
                    my_list_of_terms.append(term)
                else:
                    my_list_of_terms.append(term)
            list_of_terms = my_list_of_terms
            text = ' '.join(['%s' % term for term in list_of_terms])
            list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), m.lemmatize(text))
            text_of_output = ' '.join(['%s' % term for term in list_of_terms])
            output_data[input_file] = {}
            output_data[input_file]['id'] = data['id']
            output_data[input_file]['positive'] = data['positive']
            output_data[input_file]['sarcasm'] = data['sarcasm']
            output_data[input_file]['text'] = text_of_output
            with open(output_directory + '/' + input_file, 'w') as output_file:
                json.dump(output_data[input_file], output_file)
Example #4
    def __init__(self, path, doc_id, limit):
        """
        :param doc_id: numerical id of a document, pass manually
        """

        self.text = open(path).read().lower().replace('\n', '.')
        # need a better regex
        self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if sentence and len(sentence.split()) > 2]
        self.pos_data = []
        self.testing_data = []
        self.id = doc_id

        m = Mystem()
        counter = Counter(DEFAULTS)

        if not limit or limit > len(self.sentences):
            limit = len(self.sentences)

        for sentence in self.sentences[:limit]:

            # parse with mystem
            data = m.analyze(sentence)

            # get POS and count for each sentence
            pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
                   for word in data if word.get('analysis', None)]
            counter.update(pos)

            # append to dataset
            self.pos_data.append([counter[key] for key in sorted(counter)])

            # reset counter
            counter = Counter(DEFAULTS)
Example #5
    def __init__(self, path):

        self.text = open(path).read().lower()
        self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if len(sentence) > 1]
        self.pos_data = []

        m = Mystem()
        counter = [0, 0, 0, 0, 0]

        for sentence in self.sentences:

            # parse with mystem
            # count adjectives A, nouns S, verbs V, adverbs ADV, pronouns PR
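            # note: these substring checks are approximate (e.g. 'SPRO' in gr also matches the 'S' branch)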
            data = m.analyze(sentence)
            for word in data:
                analysis = word.get('analysis', None)
                if analysis:
                    best = analysis[0]
                    gr = best['gr']
                    if 'S' in gr:
                        counter[3] += 1
                    elif 'ADV' in gr:
                        counter[1] += 1
                    elif 'A' in gr:
                        counter[0] += 1
                    elif 'V' in gr:
                        counter[4] += 1
                    elif 'PR' in gr:
                        counter[2] += 1

            self.pos_data.append(counter)
            counter = [0, 0, 0, 0, 0]

        self.data = np.array(self.pos_data)
    def extract(self):
        try:
            # count the files in the directory
            input_files = list(filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory)))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x != " ", m.lemmatize(text))
                count_of_rows = 0
                for i in range(0, len(list_of_terms)):
                    if list_of_terms[i] == '\n' or list_of_terms[i] == ' \n':
                        count_of_rows += 1
                    if list_of_terms[i] == ' \n':
                        list_of_terms[i] = '\n'
                if count_of_rows < self.threshold_of_rows_count:
                    first_list_of_terms = list_of_terms
                    list_of_terms = []
                    for i in range(0, len(first_list_of_terms)):
                        if first_list_of_terms[i] != '\n':
                            list_of_terms.append(first_list_of_terms[i])
                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # collapse duplicate words into occurrence counts
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # tf computation
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # idf computation
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # write the result
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
Example #7
def lmtze(textfile):
    m = Mystem()
    text = open(textfile, encoding='utf-8').readlines()
    newfile = open(textfile.replace('txt', 'lem.txt'), 'w', encoding='utf-8')
    result_full = []
    for line in text:
        try:
            element = etree.fromstring(line.strip('\n'))
            text_ = element.xpath('text()')
            entities = element.xpath('*')
            result = ['<sent>']
            while text_:
                l = text_.pop(0)
                # open('temp.txt', 'w', encoding='utf-8').write(l)
                # subprocess.call(['C:\\Mystem\\mystem', 'i'])
                l = m.analyze(l)
                # print(l)
                for x in l:
                    if x.get('analysis') is not None:
                        if x.get('analysis') == []:
                            result.append(x['text'])
                        else:
                            result.append(x['analysis'][0]['lex'] + '_' + x['analysis'][0]['gr'].split(',')[0].split('=')[0])
                    else:
                        continue

                if text_:
                    e = entities.pop(0)
                    e_ = m.analyze(e.text)
                    result.append('<' + e.tag + '>')
                    for x in e_:
                        if x.get('analysis') is not None:
                            if x.get('analysis') == []:
                                result.append(x['text'])
                            else:
                                result.append(x['analysis'][0]['lex'])
                        else:
                            continue
                    result.append('</' + e.tag + '>')
        except Exception:
            continue
        result.append('</sent>')
        result_full.append(result)
        result = []
        print(len(result_full), ' разобралось')
    for sent in result_full:
        prev = ''
        for x in sent:
            if '<' in x and '/' not in x:
                newfile.write(prev + x)
                prev = ''
            elif '_' in x or x.isalpha():
                newfile.write(prev + x)
                prev = ' '
            else:
                newfile.write(x)
        newfile.write('\n')
    def extract(self):
        try:
            # count the files in the directory
            input_files = list(filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory)))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
                my_list = list_of_terms
                list_of_terms = []
                for term in my_list:
                    if m.analyze(term)[0].get(u'analysis'):
                        if not m.analyze(term)[0][u'analysis'][0][u'gr'].startswith(self.service_parts_of_speech) and len(term) > 1:
                            list_of_terms.append(term)
                        if term == u'не':
                            list_of_terms.append(term)
                    else:
                        list_of_terms.append(term)
                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # collapse duplicate words into occurrence counts
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # tf computation
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # idf computation
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # write the result
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
 def extract(self):
     try:
         # count the files in the directory
         input_files = list(filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory)))
         output_data = {}
         list_of_all_n_grams = {}
         m = Mystem()
         # iterate over the documents
         for file in input_files:
             with open(self.input_directory + '/' + file) as data_file:
                 data = json.load(data_file)
             list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
             text = " ".join(["%s" % term for term in list_of_terms])
             list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
             list_of_n_grams_tuples = {}
             for j in range(0, self.n):
                 list_of_n_grams_tuples[j] = zip(*[list_of_terms[i:] for i in range(j + 1)])
             list_of_n_grams_strings = []
             for j in range(0, self.n):
                 for gram_tuple in list_of_n_grams_tuples[j]:
                     string_of_n_gram = " ".join(["%s" % term for term in gram_tuple])
                     list_of_n_grams_strings.append(string_of_n_gram)
             output_data[file] = {}
             output_data[file]['id'] = data['id']
             output_data[file]['positive'] = data['positive']
             output_data[file]['sarcasm'] = data['sarcasm']
             output_data[file]['terms'] = {}
             # collapse duplicate n-grams into occurrence counts
             for gram in list_of_n_grams_strings:
                 if gram not in output_data[file]['terms']:
                     output_data[file]['terms'][gram] = 1
                 else:
                     output_data[file]['terms'][gram] += 1
             for gram in output_data[file]['terms'].keys():
                 if gram not in list_of_all_n_grams:
                     list_of_all_n_grams[gram] = 1
                 else:
                     list_of_all_n_grams[gram] += 1
                 # tf computation
                 count_of_n_grams = output_data[file]['terms'][gram]
                 output_data[file]['terms'][gram] = {'tf': float(count_of_n_grams)/len(list_of_n_grams_strings), 'idf': 0,
                                                     'count': float(count_of_n_grams)}
         for file in input_files:
             # idf computation
             for gram in output_data[file]['terms'].keys():
                 output_data[file]['terms'][gram]['idf'] = math.log(float(len(input_files))/list_of_all_n_grams[gram])
             # write the result
             with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                 json.dump(output_data[file], output_file)
     except Exception:
         return False
     else:
         return True
Example #10
def preprocess_corpus(id_data, text_data, lemmatization):
    print("Preprocessing the text corpus ...")

    reg = re.compile('[^a-z^A-Z^0-9^А-я^\s*]')

    descriptions = []
    item_ids = []
    rows_with_nan_desc = []

    id_data = id_data.tolist()
    if lemmatization == True:
        mystem = Mystem()

    print("Lemmatization: " + str(lemmatization))
    for i, description in tqdm(enumerate(text_data)):
        if lemmatization == True:
            temp_line = preprocess_line(description, reg, mystem)
        else:
            temp_line = preprocess_line(description, reg)

        if len(temp_line) > 0:
            descriptions.append(temp_line)
            item_ids.append(int(id_data[i]))
        else:
            rows_with_nan_desc.append(i)

    d = {}
    d['itemID'] = item_ids
    d['descriptions'] = descriptions

    d = pd.DataFrame(data=d)
    print("Number of description that are empty after preprocessing: " +
          str(len(rows_with_nan_desc)))

    return d, rows_with_nan_desc
Example #11
def preprocess_sent(text):
    mystem = Mystem()
    russian_stopwords = stopwords.words("russian")
    letter = re.compile(r'[А-Яа-я]+')

    tokens = mystem.lemmatize(text.lower())
    tokens = [
        token for token in tokens
        if letter.findall(token) and letter.findall(token)[0] == token
        and token not in russian_stopwords
    ]

    text = " ".join(tokens)

    vectors = new_preproc.preproc_texts([text])
    return vectors
def df_from_preproc():
    """
    Preprocessing data and creation of a Dataframe for LDA model
    """
    result = get_data_gp()
    reviews_list = []
    date = []
    for i in range(len(result)):
        if result[i]["content"]:
            reviews_list.append(result[i]["content"])
            date.append(result[i]["at"])

    df = pd.DataFrame(data={"date": date, "text": reviews_list})

    df.date = pd.to_datetime(df.date).dt.normalize()

    try:
        mystem = Mystem()
    except FileExistsError:
        print("Dierctory exists")

    df["text_preproc"] = df.text.apply(preprocess_text_rus, mystem=mystem)

    df = df[df["text_preproc"].apply(len) > 2].reset_index(drop=True)

    return df
Example #13
class Lemmatizer(BaseProcessor):
    def __init__(self):
        self.m = Mystem()

    def transform(self, tokens, *args):
        lemm_str = " ".join(tokens)
        return list(filter(lambda s: s.strip(), self.m.lemmatize(lemm_str)))
Example #14
class HHParser:
    def __init__(self) -> None:
        nltk.download("stopwords")
        self.mystem = Mystem()
        self.term_extractor = rutermextract.TermExtractor()
        self.russian_stopwords = stopwords.words("russian")
        with open(
                os.path.dirname(os.path.realpath(__file__)) + '/models.json',
                'rb') as file:
            self.models = dict(json.load(file))

    def preprocess_text(self, text: str, word_limit: int):
        tokens = self.mystem.lemmatize(text.lower())
        tokens = [token.split(" ") for token in tokens]
        tokens = np.concatenate(tokens)
        tokens = [token.strip() for token in tokens if token not in self.russian_stopwords \
                  and token != " " \
                  and token.strip() not in punctuation]
        text = " ".join(tokens)
        terms = self.term_extractor(text, limit=word_limit, strings=True)

        return terms

    def answer_questions(self, uid: str, questions: List[str]):
        answers = {}
        for question in questions:
            question_terms = self.preprocess_text(question, 2)
            answer = parsehh(uid, question_terms=question_terms)
            if answer is not None and answer != {}:
                answers[question] = answer
        return answers
Example #15
def init_model():
    """Init Word2Vec model."""

    logging.info("Loading model '%s' ..." % config.MODEL_NAME)

    global model
    model = gensim.downloader.load(config.MODEL_NAME)
    logging.info("Model is loaded.")

    global topics
    topics = utils.load_topics()
    logging.info("Topics: %s." % topics)

    global tags_model
    standard_library.install_aliases()

    # Mapping table from Mystem part-of-speech tags to UPoS tags:
    mapping_url = 'https://raw.githubusercontent.com/akutuzov/universal-pos-tags/4653e8a9154e93fe2f417c7fdb7a357b7d6ce333/ru-rnc.map'

    global tag_mapping
    mystem2upos = {}
    r = requests.get(mapping_url, stream=True)
    for pair in r.text.split('\n'):
        pair = pair.split()
        if len(pair) > 1:
            mystem2upos[pair[0]] = pair[1]

    tag_mapping = mystem2upos

    logging.info('Loading the tags model ...')
    tags_model = Mystem()
Example #16
def text_analysis(texts):
    token_texts, part_texts = [], []
    token_text, part_text = [], []
    token_sent, part_sent = [], []
    tokens = Mystem().analyze(texts)
    tokens = list(filter(lambda t: t != {"text": " "} and t != {"text": "-"}, tokens))[:-1]
    for token in tokens:
        if token['text'] == 'sent':
            token_text.append(token_sent)
            part_text.append(part_sent)
            token_sent, part_sent = [], []
        else:
            if token['text'] == 'stop':
                token_text.append(token_sent)
                part_text.append(part_sent)
                token_texts.append(token_text)
                part_texts.append(part_text)
                token_sent, part_sent = [], []
                token_text, part_text = [], []
            else:
                try:
                    if token['analysis'][0]['lex'] not in stopwords.words("russian"):
                        token_sent.append(token['analysis'][0]['lex'])
                        part_sent.append(re.split(r'[,=]', token['analysis'][0]['gr'])[0])
                except IndexError:
                    token_sent.append(token['text'])
                    part_sent.append('DL')
                except KeyError:
                    pass
    return token_texts, part_texts
Example #17
def get_mystem():
    for mys in mystems:
        if not mys['in_use']:
            return mys['mystem']
    else:
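        # for-else: this branch runs only when the loop finishes without returning a free instance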
        new_mys = Mystem()
        return new_mys
Example #18
class NewsTextDataset:
    def __init__(self):
        self.data = []
        self.unique_ids = []
        self.mystem = Mystem()
        self.russian_stopwords = stopwords.words("russian")

    def append(self, article: Article):
        if article.article_id not in self.unique_ids:
            self.unique_ids.append(article.article_id)
            self.data.append(article)
            return True
        else:
            return False

    def save(self, path):
        with open(path, "w") as fp:
            data = {
                "catalog": [ob.__dict__ for ob in self.data],
            }
            json.dump(
                data,
                fp,
                sort_keys=True,
                indent=4,
                ensure_ascii=False,
            )

    def load(self, path):
        with open(path) as json_file:
            data = json.load(json_file)
        self.data = [Article(dict_object=obj) for obj in data["catalog"]]

    def preprocess(self):
        for idx, article in tqdm(enumerate(self.data)):
            # r"[a-zA-Z]|\$|\d*|\(|\)|/@"
            pattern = r"[^а-яА-Я\s]"
            text = re.sub(pattern, "", article.text)
            tokens = self.mystem.lemmatize(text.lower())
            tokens = [
                token for token in tokens
                if token not in self.russian_stopwords and token != " "
                and token.strip() not in punctuation and ad.is_cyrillic(token)
            ]
            article.tokenized_text = tokens
            self.update(article, idx)

    def dump_to_pandas(self):
        return pd.DataFrame.from_records(
            [article.to_dict() for article in self.data], )

    def __len__(self):
        return len(self.data)

    def update(self, article, idx):
        self.data[idx].tokenized_text = article.tokenized_text

    # Useless for now
    def __getitem__(self, idx):
        return self.data[idx]
Example #19
def process_mystem(words, lang):
    m = Mystem()
    analysis = m.analyze(words)

    with open(lang + '_processed.txt', 'w', encoding='utf-8') as file:
        for elem in analysis:
            if elem['text'] != ' ' and elem['text'] != '\n':
                try:
                    token = elem['text']
                    lemma = elem['analysis'][0]['lex']
                    pos_tag = elem['analysis'][0]['gr'].split(',')[0].split(
                        '=')[0]
                    info = '%s\t%s\t%s\n' % (token, lemma, pos_tag)
                    file.write(info)
                except:
                    pass
Example #20
class Word2vecProcessor(object):
    """Объект для работы с моделью word2vec сходства слов"""
    def __init__(self, w2v_model_file):
        self.mystem = Mystem()
        self.word2vec = KeyedVectors.load_word2vec_format(w2v_model_file,
                                                          binary=True)
        self.lemma2word = {
            word.split('_')[0]: word
            for word in self.word2vec.index2word
        }

    def word_vector(self, word):
        lemma = self.mystem.lemmatize(word)[0]
        word = self.lemma2word.get(lemma)
        return self.word2vec[word] if word in self.word2vec else None

    def text_vector(self, text):
        """Вектор текста, получается путем усреднения векторов всех слов в тексте"""
        word_vectors = [
            self.word_vector(token) for token in word_tokenize(text.lower())
            if token.isalpha()
        ]
        word_vectors = [vec for vec in word_vectors if vec is not None]
        return np.mean(word_vectors, axis=0)

    def distance(self, vec1, vec2):
        if vec1 is None or vec2 is None:
            return 2
        return cosine(vec1, vec2)
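A minimal usage sketch of the class above; the model file name below is hypothetical and must point to a local word2vec binary whose vocabulary uses lemma_POS keys:

proc = Word2vecProcessor('ruscorpora_upos_skipgram_300.bin')  # hypothetical model file
v1 = proc.text_vector('кошка сидит на окне')
v2 = proc.text_vector('кот лежит на подоконнике')
print(proc.distance(v1, v2))  # smaller values mean more similar texts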
Example #21
class ActionDocs(Action):
    def __init__(self):
        self.m = Mystem()
        self.countries = json.load(open(file, "r"))

    def name(self) -> Text:
        return "action_get_docs"

    def run(self, dispatcher: CollectingDispatcher, tracker: Tracker,
            domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
        input_country = tracker.get_slot('country')
        lemmas = self.m.lemmatize(input_country)
        country = lemmas[0].capitalize()
        found = False
        for i in self.countries:
            if country == i["country"] or input_country == i["country"].lower(
            ):
                if input_country == i["country"]:
                    country = i["country"]
                found = True
                dispatcher.utter_message(text=f"{i['documents']}")
                break

        if not found:
            dispatcher.utter_message(
                text=f"Я не знаю такую страну '{country}'")

        return []
Example #22
def preprocessing(path):
    stem = Mystem()
    stop = set(stopwords.words("russian"))
    stop.update([
        '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',
        '#', '№', '*', '_', '\n'
    ])

    def preprocess_text(input_text):
        param = re.sub('[^a-zA-Zа-яА-Я]', ' ', input_text)
        param = param.lower()
        param = stem.lemmatize(param)
        param = [
            token for token in param if token not in stop and token != " "
            and token.strip() not in punctuation
        ]
        input_text = " ".join(param)
        input_text = ' '.join(word for word in input_text.split()
                              if len(word) > 3)

        return input_text

    html_report_part1 = open(path, 'r')
    soup = BeautifulSoup(html_report_part1, 'html.parser')
    return preprocess_text(soup.get_text())
def pos_bi(text):
    pos_tags = []
    m = Mystem()
    sents = sent_tokenize(text)
    for sent in sents:
        sent_an = []
        analy = m.analyze(sent)
        for x in analy:
            try:
                if 'analysis' in x.keys():
                    tag = x['analysis'][0]['gr']
                    sent_an.append(re.sub(r'[=|,].*', '', tag).lower())
            except IndexError:
                pass
        pos_tags.append(sent_an)
    return pos_tags
Example #24
class ActionInZone(Action):
    def __init__(self):
        self.countries = json.load(open(file, "r"))
        self.m = Mystem()
        self.schengens = [
            "Австрия", "Бельгия", "Чешская Республика", "Дания", "Эстония",
            "Финляндия", "Франция", "Германия", "Греция", "Венгрия",
            "Исландия", "Италия", "Латвия", "Литва", "Люксембург", "Мальта",
            "Голландия", "Норвегия", "Польша", "Португалия", "Словакия",
            "Словения", "Испания", "Швеция", "Швейцария", "Лихтенштейн"
        ]

    def name(self) -> Text:
        return "is_schengen_zone"

    def run(self, dispatcher: CollectingDispatcher, tracker: Tracker,
            domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
        country = tracker.get_slot('country')
        lemmas = self.m.lemmatize(country)
        country = lemmas[0].capitalize()
        if country in self.schengens:
            dispatcher.utter_message(text=f"{country} член шенгенской зоны")
        else:
            dispatcher.utter_message(
                text=f"{country} не входит в шенгенскую зону")

        return []
def text_preprocessing(text_col: pd.Series, stopwords: list) -> pd.Series:
    """
    Функция для предобработки названий и текстового описания товара.
    
    Args:
        text_col: Столбец с текстовой информацией.
        stopwords: Список стоп-слов.
    """

    lemmatize_func = Mystem().lemmatize

    pattern = r'\b(?:{})\b'.format('|'.join(stopwords))
    text = (
        text_col.str.lower().str.replace(
            r'<[^>]+>|[^a-zа-яё0-9]', ' ').str.replace(
                r'(\s)',
                ' ').str.strip()  # strip leading/trailing whitespace
        #         .apply(lambda x: lemmatize_func(x) if isinstance(x, str) else None) # lemmatization
        #         .apply(lambda x: ' '.join(x) if isinstance(x, list) else None) # join the lemmatized words
        .str.replace(pattern, ' ')  # remove stop words
        .str.replace(r'\b(\w)\b', '')  # remove one-letter words
        .str.replace(r'\s+', ' ')  # collapse runs of whitespace into a single space
        .str.strip()  # strip leading/trailing whitespace
    )

    return text
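A hedged usage sketch with a tiny, made-up Series of product descriptions:

import pandas as pd
from nltk.corpus import stopwords

titles = pd.Series(['Красный чайник <b>2000 Вт</b>', 'Чайник б/у, работает отлично'])
print(text_preprocessing(titles, stopwords.words('russian')).tolist())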
Example #26
 def __init__(self,
              language='english',
              maxsents=0,
              lemma=False,
              cstlemma_dir=None):
     self.filepaths = None
     self.sentence_list = None
     self.language = language
     self.m = Mystem()
     self.maxsents = maxsents
     self.count = 0
     self.word_token_count = 0
     self.lemma = lemma
     self.cstlemma_dir = cstlemma_dir
     if not self.cstlemma_dir:
         self.cstlemma_dir = w2vconfig.cstlemma_dir
Example #27
class Lemmatizer:
    def __init__(self, stop_words = None):
        self.stemmer = Mystem()
        self.cache = dict()  # MyCache(maxsize=1000000)
        stop_words = stop_words if stop_words is not None else []
        self.stop_words = set(stop_words + [' ', '\n', '\r\n', '\t'])

    def lemmatize_word(self, word):
        res = self.cache.get(word, None)
        if res is not None:
            return res

        lm = self.stemmer.lemmatize(word)
        lm = [w for w in lm if w not in self.stop_words]

        if len(lm) == 0:
            return None

        lemmatized_word = max(lm, key=lambda x: len(x))

        self.cache[word] = lemmatized_word

        return lemmatized_word

    def fit_transform(self, words):
        if len(words) == 0:
            return []

        res = [self.lemmatize_word(w) for w in words]
        res = [w for w in res if w is not None]
        return res
class MyStemWrapper:
    def __init__(self, join_string: str = ''):
        self._join_string = join_string
        self._stemmer = Mystem()

    def filter_string(self, s: str):
        return self._join_string.join(self._stemmer.lemmatize(s)[:-1])
class TextsLematizer():
    def __init__(self):
        self.m = Mystem()

    # function that preprocesses a text
    def text_hangling(self, text: str):
        try:
            txt = re.sub('[^a-zа-я\d]', ' ', text.lower())
            txt = re.sub('\s+', ' ', txt)
            # any additional processing, including alternative variants, could be plugged in here
            return txt
        except:
            return ""

    # lemmatize a single text
    def text_lemmatize(self, text: str):
        try:
            lemm_txt = self.m.lemmatize(text)
            lemm_txt = [w for w in lemm_txt if w not in [' ', '\n']]
            return lemm_txt
        except:
            return ['']

    # lemmatize a list of texts
    def texts_lemmatize(self, texts_list):
        return [
            self.text_lemmatize(self.text_hangling(tx)) for tx in texts_list
        ]
Example #30
def load_data(data_dir='news', parts=('train', 'test')):
    """
    Loads data from specified directory. Returns dictionary part->(list of texts, list of corresponding labels).
    """
    part2xy = {
    }  # tuple(list of texts, list of their labels) for train and test parts
    myStem = Mystem()
    for part in parts:
        print('Loading %s set ' % part)

        xpath = os.path.join(data_dir, '%s.texts' % part)
        with codecs.open(xpath, 'r', encoding='utf-8') as inp:
            wholeText = inp.read().strip()
            texts = lemmatize(myStem, wholeText).split('\n')

        ypath = os.path.join(data_dir, '%s.labels' % part)
        if os.path.exists(ypath):
            with codecs.open(ypath, 'r', encoding='utf-8') as inp:
                labels = [s.strip() for s in inp.readlines()]
            assert len(labels) == len(
                texts), 'Number of labels and texts differ in %s set!' % part
            for cls in set(labels):
                print(cls, sum((1 for l in labels if l == cls)))
        else:
            labels = None
            print('unlabeled', len(texts))

        part2xy[part] = (texts, labels)
    return part2xy
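A hedged usage sketch; it assumes the news/train.texts and news/train.labels files exist and that the lemmatize helper called inside load_data is defined elsewhere in the module:

part2xy = load_data(data_dir='news', parts=('train', 'test'))
train_texts, train_labels = part2xy['train']
print(len(train_texts), 'training texts')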
    def prep_lemmatize(self, text):
        """
        Eng:
        ===============================================================================
        :param text: Text for preprocessing;

        :return: Preprocessed text with all lemmatized words.

        Lemmatize all words with WordNet Lemmatizer.
        ===============================================================================

        Ru:
        ===============================================================================
        :param text: Текст для предобработки;

        :return: Обработанный текст, в котором каждое слово подвергнулось лемматизации.

        Лемматизирует все слова с помощью WordNet лемматизатора.
        ===============================================================================
        """
        if isinstance(text, str):
            if self.lang == "ru":
                return "".join(Mystem().lemmatize(text))
            return " ".join(
                [WordNetLemmatizer().lemmatize(word) for word in text.split()])
        else:
            raise TypeError("Argument must be str!")
Example #32
def lemmatize(s):
    global m
    try:
        return ''.join(m.lemmatize(s)).strip()
    except BrokenPipeError as ex:
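        # the external mystem process died; restart it and retry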
        m = Mystem()
        return lemmatize(s)
Example #33
    def _lemmatize(self, text):
        from pymystem3 import Mystem

        mystem = Mystem()
        russian_stopwords = stopwords.words("russian")
        tokens = mystem.lemmatize(text.lower())
        tokens = [
            token
            for token in tokens
            if token not in russian_stopwords
            and token != " "
            and token.strip() not in punctuation
        ]
        text = " ".join(tokens)
        text = re.sub(" +", " ", text)
        return text
Example #34
class Lemmatisation(object):
    def __init__(self):
        self.ru_lem = Mystem()
        self.en_lem = nltk.stem.WordNetLemmatizer()

        self.ru_stop_words = set(
            nltk.corpus.stopwords.words('russian') +
            [chr(i) for i in range(ord('а'),
                                   ord('я') + 1)])
        self.en_stop_words = set(
            nltk.corpus.stopwords.words('english') +
            [chr(i) for i in range(ord('a'),
                                   ord('z') + 1)])

    def visible(self, term):
        if re.search(
                NOT_DIGIT_OR_LETTER, term
        ) or term in self.ru_stop_words or term in self.en_stop_words:
            return False
        return True

    def _lemmatize(self, doc):
        lemmas = self.ru_lem.lemmatize(doc)
        lemmas = [
            self.en_lem.lemmatize(lemma) for lemma in lemmas
            if self.visible(lemma)
        ]
        return ' '.join(lemmas)

    def lemmatize(self, doc_id, doc):
        try:
            return self._lemmatize(doc)
        except Exception as e:
            print(doc_id, e)
Example #35
def mystem_analyze(str):
    global m
    try:
        return m.analyze(str)
    except BrokenPipeError as ex:
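        # same recovery pattern: recreate the Mystem process and retry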
        m = Mystem()
        return mystem_analyze(str)
 def calc_query_score(self, query):
     self.result_query[query] = {}
     result_docs = set(range(len(self.articles)))
     substr_list = set()
     query_1 = ''.join(Mystem().lemmatize(query)).strip()
     term = list()
     for word in query_1.split():
         if word[0] == '-':
             substr_list.update(set(self.word_map.get(word[1:], list())))
         elif self.word_map.get(word, None) is not None:
             term.append(word)
             result_docs.intersection_update(set(self.word_map.get(word)))
         else:
             result_docs = set()
             break
         result_docs.difference_update(substr_list)
     self.result_query[query] = {}
     for doc_num in result_docs:
         self.result_query[query][doc_num] = {}
         self.result_query[query][doc_num]['score'] = 0
         self.result_query[query][doc_num]['score_full'] = 0
         self.result_query[query][doc_num]['score_title'] = 0
         self.result_query[query][doc_num]['score_annotate'] = 0
         for word in term:
             self.add_tf_idf(query, word, doc_num, 'tf_idf', 'score')
             self.add_tf_idf(query, word, doc_num, 'tf_idf_full',
                             'score_full')
             self.add_tf_idf(query, word, doc_num, 'tf_idf_title',
                             'score_title')
             self.add_tf_idf(query, word, doc_num, 'tf_idf_annotate',
                             'score_annotate')
Example #37
class Tokenizer:
    def __init__(self):
        self.space_pattern = re.compile(r'[^.А-ЯA-ZЁ]+', re.I)

        self.m = Mystem()

        try:
            with open('nw_model/stopwords.txt') as f:
                self.stop_words = set(f.read().split('\n')) | {''}
        except FileNotFoundError:
            self.stop_words = set()
            print(
                f'{Fore.RED}WARNING!!! Stop-words file not found!{Style.RESET_ALL}'
            )

    def tokenize_line(self, line):
        """
        Токенизирует одну строку
        :param line:
        :return: набор лексем (pymysteam)
        """
        try:
            return [
                word for word in self.m.lemmatize(
                    self.space_pattern.sub(' ', line.lower()))
                if word.strip() not in self.stop_words
            ]
        except BrokenPipeError:
            self.m = Mystem()
            return self.tokenize_line(line)

    def join(self, lst):
        return self.space_pattern.sub(' ', ' '.join(lst))
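A short usage sketch, assuming the imports used by the class (re, pymystem3, colorama) are in place:

tok = Tokenizer()
print(tok.tokenize_line('Мама мыла раму.'))  # lemmas, with stop words removed if nw_model/stopwords.txt is present
print(tok.join(['мама', 'мыть', 'рама']))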
Example #38
def get_okapi(query):
    """
    Returns Okapi BM25 score for every document given word in corpus
    :param query:
    :return:
    """
    m = Mystem()
    query = text_to_list(query, m)  # list of lemmas
    with open('result_1.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        total_score = defaultdict(int)
        for word in query:
            try:
                all_info = (data[word])
                n = (len(all_info))
                for article in all_info:
                    fq = article['freq']
                    dl = get_dl(article['doc_name'])
                    score = score_BM25(n=n, fq=fq, dl=dl)
                    total_score[article['doc_name']] += score
            except KeyError:
                pass
        result = sorted(total_score.items(), key=lambda x: x[1],
                        reverse=True)[:10]
        return result
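score_BM25, get_dl and text_to_list are helpers defined elsewhere in the original project. Purely as an illustration, a minimal sketch of what score_BM25 might look like under the standard Okapi BM25 weighting (the corpus constants below are hypothetical):

from math import log

N = 165        # hypothetical: total number of documents in the corpus
AVGDL = 350.0  # hypothetical: average document length in tokens
K1, B = 2.0, 0.75

def score_BM25(n, fq, dl):
    # n: documents containing the term, fq: term frequency in the document, dl: document length
    idf = log((N - n + 0.5) / (n + 0.5) + 1)
    return idf * (fq * (K1 + 1)) / (fq + K1 * (1 - B + B * dl / AVGDL))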
Example #39
def preprocess(word):
    stem = Mystem()
    stop = set(stopwords.words("russian"))
    stop.update([
        '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',
        '#', '№', '*', '_', '\n'
    ])
    param = re.sub('[^a-zA-Zа-яА-Я]', ' ', word)
    param = param.lower()
    param = stem.lemmatize(param)
    param = [
        token for token in param if token not in stop and token != " "
        and token.strip() not in punctuation
    ]
    word = " ".join(param)
    word = ' '.join(word for word in word.split() if len(word) > 3)
    return word
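A quick usage sketch of the function above; the exact output depends on the NLTK Russian stop-word list:

print(preprocess('Очень хорошие наушники, звук просто отличный!'))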
Example #40
def with_not(directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = list(filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' ')))


        # handle "не" + (word): merge the negation with the following word
        nums_of_bigrams = []
        helping_words = [u'совсем', u'очень', u'слишком', u'самый']
        for i in range(0, len(list_of_terms)):
            if list_of_terms[i] == u'не' and list_of_terms[i+1] not in helping_words:
                if m.analyze(list_of_terms[i+1])[0].get(u'analysis'):
                    if not m.analyze(list_of_terms[i+1])[0][u'analysis'][0][u'gr'].startswith(u'S,'):
                        nums_of_bigrams.append((i, i+1))
            elif list_of_terms[i] == u'не' and list_of_terms[i+1] in helping_words:
                if m.analyze(list_of_terms[i+2])[0].get(u'analysis'):
                    if not m.analyze(list_of_terms[i+2])[0][u'analysis'][0][u'gr'].startswith(u'S,'):
                        nums_of_bigrams.append((i, i+2))
        for i in range(0, len(nums_of_bigrams)):
            if nums_of_bigrams[i][0] + 1 == nums_of_bigrams[i][1]:
                list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]]
                list_of_terms[nums_of_bigrams[i][1]] = ''
            elif nums_of_bigrams[i][0] + 2 == nums_of_bigrams[i][1]:
                list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]]
                list_of_terms[nums_of_bigrams[i][1] - 1] = ''
                list_of_terms[nums_of_bigrams[i][1]] = ''
        list_of_terms = filter(lambda x: x != '', list_of_terms)


        text_of_output = ' '.join(['%s' % term for term in list_of_terms])

        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output

        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
Example #41
class Runner(object):
    def __init__(self, input_text):
        self.lemmatize = None
        while True:
            response = raw_input("Do you want to lemmatize text first? (yes/no)\n").lower()
            if response == "yes":
                print "You should wait for a while"
                self.lemmatize = True
                self.stemmer = Mystem()
                break
            elif response == "no":
                self.lemmatize = False
                break

        self.word_lists = list()
        with open(input_text, "r") as f:
            for line in f:
                line += "."
                if self.lemmatize:
                    lexemes = self.stemmer.lemmatize(line)
                    word_list = list()  # list of words not separated by punctuation marks
                    for lexeme in lexemes:
                        lexeme = lexeme.strip()
                        if lexeme:
                            if lexeme.translate(None, '.,?!:;()"\' -\t\n'):  # check that the lexeme is not a punctuation mark
                                lexeme = lexeme.decode("utf-8")
                                if is_cyrillic(lexeme):
                                    word_list.append(lexeme)
                            else:  # otherwise, add the bigrams from this list and start a new empty one
                                self.word_lists.append(word_list)
                                word_list = list()
                else:
                    line = line.replace(".", " . ").replace(",", " , ").replace(":", " : ").replace(";", " ; ")\
                        .replace("?", " ? ").replace("!", " ! ").replace("(", " ( ").replace(")", " ) ")\
                        .replace("--", " -- ").replace(".", " . ")
                    word_list = list()
                    for lexeme in line.split():
                        # check that the lexeme is not a punctuation mark
                        lexeme = lexeme.translate(None, '.,?!:;()"\'').replace("--", "").decode("utf-8").strip().lower()
                        if lexeme:
                            if is_cyrillic(lexeme):
                                word_list.append(lexeme)
                        else:
                            if word_list:
                                self.word_lists.append(word_list)
                            word_list = list()

        train, test = self.split()
        self.lid = Lid(train, test)
        self.lid.run()

    def split(self):
        n = len(self.word_lists)
        train = self.word_lists[:n*9/10]
        test = self.word_lists[n*9/10:]
        return train, test
Example #42
def mystem_using(input_directory, output_directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(input_directory))
    output_data = {}
    m = Mystem()
    for input_file in input_files:
        with open(input_directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x != '', re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', data['text']))
        text = " ".join(["%s" % term for term in list_of_terms])
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), m.lemmatize(text))
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])
        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output

        with open(output_directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
Example #43
class Index(object):

    def __init__(self, input_file):
        self.stemmer = Mystem()
        self.documents = dict()
        self.tokens = list()
        self.terms = dict()
        self.index = list()

        # reading documents, making tokenization
        with open(input_file, "r") as f:
            for i, line in enumerate(f, start=1):
                self.documents[i] = line.decode("utf-8")
                for word in self.stemmer.lemmatize(line):
                    token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip()
                    if token:
                        self.tokens.append((token, i))

        # sorting by tokens first, then by frequency
        self.tokens.sort(key=lambda tup: (tup[0], tup[1]))

        # terminization and building index
        current_term = self.tokens[0][0]
        current_doc_id = self.tokens[0][1]
        doc_ids = [current_doc_id]
        for token, doc_id in self.tokens:
            term = token.lower()
            if term == current_term:
                if doc_id != current_doc_id:
                    doc_ids.append(doc_id)
                    current_doc_id = doc_id
            else:
                self.terms[current_term] = (len(doc_ids), doc_ids)
                self.index.append((current_term, len(doc_ids), doc_ids))
                current_term = term
                current_doc_id = doc_id
                doc_ids = [doc_id]
        self.terms[current_term] = (len(doc_ids), doc_ids)
        self.index.append((current_term, len(doc_ids), doc_ids))

    def print_to_file(self):
        with open("result.txt", "w") as f:
            for term, count, doc_ids in self.index:
                f.write("{},\t{},\t{}\n".format(term.encode("utf-8"), count, doc_ids))

    def print_statistics(self):
        terms_num = len(self.terms)
        terms_len = 0.
        for term in self.terms:
            terms_len += len(term)

        print "***********************"
        print "Number of terms = {}".format(terms_num)
        print "Average term length = {}".format(terms_len / terms_num)
        print "***********************"
def search():
    cn = None
    file = codecs.open('static/articles.xml', 'r', 'utf-8')
    rfile = file.read()
    tree = lxml.etree.fromstring(rfile)
    res = tree.xpath('entry')
    categ = {
        'cat': 'Категория', 'wgroup': 'Группа слов с близким значением', 'comm': 'Комментарии',
        'stdiff': 'Стилистические различия', 'overlap': 'Совпадающая часть значения',
        'dom': 'Доминанта группы', 'diffmark': 'Различительные признаки, релевантные для данной группы',
        'diff': 'Смысловые различия', 'rare': 'Редкие слова, примыкающие к группе',
        'anmean': 'Другие значения слов, входящих в группу', 'comb': 'Сочетаемость', 'reg': 'Региональные варианты',
        'adict': 'Данные академических словарей', 'doc': 'Нормативные документы',
        'etim': 'Этимология', 'ill': 'Иллюстрации'
    }
    file.close()
    ms = Mystem()
    wordsearch = ms.lemmatize(request.form['search'].lower())[0]

    for i in res:
        if wordsearch == '':
            cn = 'Пустой запрос'
        elif i.text.lower().startswith(wordsearch):
            arr = []
            for j in i.iter():
                for k in dict.keys(categ):
                    if j.tag == k:
                        if j.text != 'null':
                            arr.append('<font size="4"><b>' + str(categ[j.tag]) + '</b></font><br>' + str(j.text))
                text = '<br><br>'.join([j for j in arr[1:]])
                text = re.sub('\*', '<b>', text)
                text = re.sub('\#', '</b>', text)
                text = re.sub('\$', '<i>', text)
                text = re.sub('\%', '</i>', text)
                text = re.sub('\@', '<font color="#696969">', text)
                text = re.sub('\+', '</font>', text)
                cn = '<strong><big>' + i.text + '</big></strong><br><br>' + re.sub('\n', '<br>', text)
            break
        else:
            cn = 'По Вашему запросу ничего не найдено. <br>' \
                 'Попробуйте использовать "Поиск по тегу" или измените запрос.'
    return render_template('search.html', cn=Markup(cn))
Example #45
    def build_pos(self):

        m = Mystem()
        counter = Counter(DEFAULTS)

        for doc in self.documents:

            # parse with mystem
            data = m.analyze(doc.text)

            # get POS and count for each sentence
            pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
                   for word in data if word.get('analysis', None)]
            counter.update(pos)

            # append to dataset
            self.pos_data.append([counter[key] for key in sorted(counter)])

            # reset counter
            counter = Counter(DEFAULTS)
Example #46
def produce_lemmas(connection, tableName, outputTableName):
    mystem = Mystem()
    cursor = connection.cursor()
    inserter = connection.cursor()

    query = 'DELETE FROM `%s`' % outputTableName
    inserter.execute(query)
    connection.commit()

    query = 'SELECT * FROM `%s`' % tableName
    cursor.execute(query)
    query = 'INSERT INTO `' + outputTableName + '` (`' + tableName + '_id`, `word_class_id`, `lex`, `gr`)' \
            'SELECT %i, `id`, "%s", "%s" FROM `word_classes` WHERE `abbr`="%s"'
    for id, concept, scheme in cursor:
        lemmas = mystem.analyze(concept)
        for lemma in lemmas:
            for analysis in lemma.get('analysis', []):
                inserter.execute(query % prepare_content(id, analysis))
    connection.commit()

    cursor.close()
Example #47
class Index(object):
    def __init__(self, input_file):
        self.stemmer = Mystem()
        self.tokens = list()
        self.index = dict()
        self.number_of_documents = 0

        try:
            self.read_from_file_compressed("index_compressed.txt")
        except:
            # reading documents, making tokenization
            with open(input_file, "r") as f:
                for line in f:
                    self.number_of_documents += 1
                    # self.documents[i] = line.decode("utf-8")
                    for word in self.stemmer.lemmatize(line):
                        token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip()
                        if token:
                            self.tokens.append((token, self.number_of_documents))

            # sorting by tokens first, then by frequency
            self.tokens.sort(key=lambda tup: (tup[0], tup[1]))

            # terminization and building index
            current_term = self.tokens[0][0]
            current_doc_id = self.tokens[0][1]
            doc_ids = [current_doc_id]
            for token, doc_id in self.tokens:
                term = token.lower()
                if term == current_term:
                    if doc_id != current_doc_id:
                        doc_ids.append(doc_id)
                        current_doc_id = doc_id
                else:
                    self.index[current_term] = (len(doc_ids), pack_doc_ids(doc_ids))
                    current_term = term
                    current_doc_id = doc_id
                    doc_ids = [doc_id]
            self.index[current_term] = (len(doc_ids), pack_doc_ids(doc_ids))
            del self.tokens
            self.write_index_in_file()

    def write_index_in_file(self):
        with open("index_compressed.txt", "w") as f:
            pickle.dump(self.index, f)

    def read_from_file_compressed(self, index_file):
        with open(index_file, "r") as f:
            self.index = pickle.load(f)
Example #48
def fill_mystem():
    from pymystem3 import Mystem
    m = Mystem()
    for sentence in get_sentences(1):
        lemmas = m.analyze(sentence.source)
        items = list()
        for lemma in lemmas:
            text = lemma['text']
            analysis = lemma.get('analysis')
            if not analysis:
                text = text.strip()
                if not len(text):
                    print 'spaces = "%s"' % text
                    continue
                if ' ' in text:
                    for item in re.split('\s+', text):
                        items.append("%s   %s ?" % (item, item))
                    print 'several =', "|".join(re.split('\s+', text))
                    continue
                print 'delimiter = "%s"' % text
                items.append("%s   %s ?" % (text, text))
                continue

            if not len(text.strip()):
                raise Exception('Impossible')
            if ' ' in text:
                raise Exception('Impossible')

            lexemes = list()
            for lexeme in analysis:
                print 'lex=', lexeme.get('lex', '-')
                print 'gr=', lexeme.get('gr', '-')
                lexemes.append("%s %s" % (lexeme['lex'], lexeme['gr']))
            items.append("%s   %s" % (text, '  '.join(lexemes)))
        sentence.mystem = '\n'.join(items)
        sentence.save()
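For orientation: Mystem.analyze() returns a list of dicts where every item has a 'text' key, and analyzable words additionally carry an 'analysis' list with 'lex' (lemma) and 'gr' (grammatical tags); whitespace and punctuation come back without 'analysis', which is exactly what the function above branches on. A small illustration (the sample sentence is arbitrary; exact tag strings depend on the mystem version):

from pymystem3 import Mystem

m = Mystem()
for item in m.analyze(u'Мама мыла раму.'):
    analysis = item.get('analysis')
    if analysis:
        print('%s -> %s %s' % (item['text'], analysis[0]['lex'], analysis[0]['gr']))
    else:
        print('%r  (no analysis: whitespace or punctuation)' % item['text'])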
Beispiel #49
0
    def __init__(self, input_file):
        self.stemmer = Mystem()
        self.documents = dict()
        self.tokens = list()
        self.terms = dict()
        self.index = list()

        # reading documents, making tokenization
        with open(input_file, "r") as f:
            for i, line in enumerate(f, start=1):
                self.documents[i] = line.decode("utf-8")
                for word in self.stemmer.lemmatize(line):
                    token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip()
                    if token:
                        self.tokens.append((token, i))

        # sorting by tokens first, then by frequency
        self.tokens.sort(key=lambda tup: (tup[0], tup[1]))

        # terminization and building index
        current_term = self.tokens[0][0]
        current_doc_id = self.tokens[0][1]
        doc_ids = [current_doc_id]
        for token, doc_id in self.tokens:
            term = token.lower()
            if term == current_term:
                if doc_id != current_doc_id:
                    doc_ids.append(doc_id)
                    current_doc_id = doc_id
            else:
                self.terms[current_term] = (len(doc_ids), doc_ids)
                self.index.append((current_term, len(doc_ids), doc_ids))
                current_term = term
                current_doc_id = doc_id
                doc_ids = [doc_id]
        self.terms[current_term] = (len(doc_ids), doc_ids)
        self.index.append((current_term, len(doc_ids), doc_ids))
Beispiel #50
0
import json
import requests
import pandas

from pymystem3 import Mystem

API_KEY = "api_key"

if __name__ == "__main__":
    not_translated = []
    dictionary = {}
    print(len(dictionary.keys()))
    m = Mystem()
    df = pandas.read_csv("/media/alexander/b32bf4b4-8724-4107-9d19-abf6615c2f60/alexander/HELP_FILE/query.yaHotelId.showInTop.sure.final.tsv", sep="\t")
    df_size = len(df["query"])
    k = 1
    for line in df["query"]:
        print(k, "query from", df_size)
        k += 1
        for word in line.strip().split():
            lema_word = m.lemmatize(word)[0]
            if dictionary.get(lema_word) is None:
                params = {"key": API_KEY, "text": lema_word, "lang": "ru-en"}
                try:
                    r = requests.get("https://translate.yandex.net/api/v1.5/tr.json/translate", params=params)
                    r_json = r.json()
                    trans_word = r_json["text"][0]
                    if r_json["code"] != 200:
                        print("ERROR", r_json["code"])
                        not_translated.append(lema_word)
                        continue
Beispiel #51
0
from pymystem3 import Mystem
m = Mystem()

t = 'Чайника, сегодня не было'
lemma = m.lemmatize(t)


def lemmas(text):
    punc = list('.?!-;:",')
    text = [i for i in text if i not in punc]
    text = ''.join(text)
    text = m.lemmatize(text)
    textn = ''
    for w in text:
        # keep the spaces between lemmas; drop only the newlines mystem appends
        if w != '\n':
            textn += w
    return textn


from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import os

s_w = stopwords.words('russian')
sw = [i for i in s_w]

v = TfidfVectorizer(stop_words=sw) # remove stop words
#v = TfidfVectorizer() # keep stop words

totalCorpus = []
suspenseCorpus = ''
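The snippet breaks off before totalCorpus is populated; a minimal sketch of how lemmas() and the vectorizer above would typically be wired together. The two sample documents are placeholders, not from the original corpus, and get_feature_names is the older scikit-learn name (newer releases call it get_feature_names_out):

docs = [u'Чайника сегодня не было', u'Сегодня чайник был на месте']  # placeholder documents
totalCorpus = [lemmas(doc) for doc in docs]

matrix = v.fit_transform(totalCorpus)      # rows: documents, columns: tf-idf weights
print(v.get_feature_names())               # vocabulary left after stop-word removal
print(matrix.toarray())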
Beispiel #52
0
class MystemOCTagger(object):
	def __init__(self):
		self.mystem_inst = Mystem()


	def run_and_convert(self, input_file, output_file, strict_match = False):
		f_in = open(input_file, 'rb')
		f_out = open(output_file, 'w+')
		context = etree.iterparse(f_in, tag='sentence')
		for event, sentence_elem in context:
			sentence = sentence_elem.find('source')
			analyzed = self.analyze_sentence(sentence.text)
			tokens_tree = sentence_elem.find('tokens')
			tokens = self.extract_tokens(tokens_tree)
			matched = self.match_analyzed_tokens(tokens, analyzed, strict_match)

			result = self.analyzed_to_csv_list(matched)
			for s in result:
				f_out.write(s+'\n')

			sentence_elem.clear()

		f_out.close()
		f_in.close()

	def analyze_sentence(self, sentence):
		return self.mystem_inst.analyze(sentence)

	# builds word-index mapping, indices sorted in order of appearance
	def extract_tokens(self, tokens_tree):
		tokens_dict = {}
		for t in tokens_tree.iter('token'):
			idx = t.get('id')
			token = t.get('text')
			token = strip_word(token)
			if (len(token) > 0):
				if token in tokens_dict:
					tokens_dict.get(token).append(idx)
				else:
					tokens_dict[token] = [idx]

		return tokens_dict


	# matches analysis with original tokens indices   
	def match_analyzed_tokens(self, tokens_index, analyzed, strict_match = False):
		analysis_indexed = {}
		unindexed = []
		for t in analyzed:
			t_text = t.get('text')
			t_text = strip_word(t_text)
			if len(t_text) > 0:
				if t_text in tokens_index:
					idx = tokens_index.get(t_text).pop(0)
					if (len(tokens_index.get(t_text)) == 0):
						tokens_index.pop(t_text)
					analysis_indexed[idx] = t.get('analysis')
				else:
					unindexed.append(t)

		if (not strict_match):
			analysis_not_strict = {}
			if len(tokens_index) > 0:
				analysis_not_strict = self.match_not_strict(tokens_index, unindexed)

			analysis_indexed.update(analysis_not_strict)

		not_analyzed = []
		if len(tokens_index) > 0:
			for t in tokens_index:
				not_analyzed.append(t)

#		if len(not_analyzed) > 0:
#			f_unindexed = open('mismatch.txt', 'a+')
#			f_unindexed.write('oc ')
#			f_unindexed.write(str(not_analyzed)+'  ')
#
#			if len(unindexed) > 0:
#				f_unindexed = open('mismatch.txt', 'a+')
#				for u in unindexed:
#					f_unindexed.write(' ')
#					f_unindexed.write(str(u.get('text')))

#			f_unindexed.write('\n')


		return analysis_indexed

	def match_not_strict(self, tokens_index, analyzed):
		analysis_indexed = {}
		for t_indexed, idx_list in tokens_index.items():
			# iterate over a copy: idx_list is modified inside the loop
			for idx in list(idx_list):
				for i in range(0, len(analyzed)):
					t_analyzed = analyzed[i]
					if t_indexed.endswith(t_analyzed.get('text')):
						analysis_indexed[idx] = t_analyzed.get('analysis')
						#print(t_analyzed.get('text')+' '+t_indexed)
						analyzed.pop(i)
						idx_list.remove(idx)
						break

		idx_copy = tokens_index.copy()
		for t, i in idx_copy.items():
			if len(i) == 0:
				del tokens_index[t]


		return analysis_indexed

	def analyzed_to_csv_list(self, analyzed):
		out = []
		for idx, analysis in sorted(analyzed.items()):
			if analysis and len(analysis) > 0:
				#do we need only grammar?        
				s = str(idx) + ', ' + str(analysis[0].get('gr'))
				out.append(s)

		return out
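A possible driver for the class above, assuming an OpenCorpora-style XML with <sentence>, <source> and <tokens>/<token> elements; the file names are placeholders. The snippet itself presumes lxml.etree and a strip_word helper defined elsewhere:

tagger = MystemOCTagger()
# writes one "token_id, grammatical tags of the first analysis" line per matched token
tagger.run_and_convert('annot.opcorpora.xml', 'mystem_vs_opencorpora.csv', strict_match=False)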
Beispiel #53
0
	def __init__(self):
		self.mystem_inst = Mystem()
Beispiel #54
0
def poehali(csv_input):
	'''
	Main function.
	csv_input -- CSV file with the table of links
	Output layout:
	|-xmlFile/
	|---------year/
	|--------------month/
	=========
	|-plain/
	|-------year/
	|------------month/
	=========
	|-html/
	|------year/
	|-----------month/
	|csv_file.csv

	'''
	data = []
	i = 0
	m = Mystem()
	gusina()
	col = ["path", "author", "sex", "birthday", "header", "created", "sphere", "genre_fi", "type", "topic", "chronotop", "style", "audience_age", "audience_level", "audience_size", "source", "publication", "publisher", "publ_year", "medium", "country", "region", "language"]
	time.sleep(3)

	path = os.getcwd()
	path = path + "/"
	csv_file = open(path + "csv_file.csv", "w")
	writer = csv.writer(csv_file,delimiter = ",")
	writer.writerow(col)


	dosugvbryanske = re.compile("^(http://www.briansk.ru/)(.+)")

	with open(csv_input) as csvfile:
		reader = csv.DictReader(csvfile)
		for row in reader:
			if re.search(dosugvbryanske, row['url']):
				print('passing on ' + str(i))

				test = urllib.request.urlopen(row['url']).read().decode('cp1251')
				file_html = path+"/"+str(i)+".html"
				file_html1 = path+"/"+str(i-1)+".html"
				dest_html = str(i)+".html"
				plain = str(i)+".txt"
				plain_new = str(i)+"_plained.txt"
				plain_stem = str(i)+"_mystem.txt"
				output_plain_stem = str(i)+"_out_mystem.txt"
				xmlFile = str(i) + ".xml"
				#dir_for_stem = "XML_STEM"

				page1_html = open(file_html, 'w')
				page1_html.write(str(test))
				page1_html.close()
				print("FILE EX: "+ str(os.path.exists(file_html)))
				pageMoving = open(file_html, 'r')
				#print(file_html + " PATH " + dest_html+"\n")
				if os.path.exists(file_html1):
					os.remove(file_html1)
					print("FILE "+str(i-1)+" HB REMOVED")
				else:
					print("FILE "+str(i-1)+" HB ALREADY MOVED")
				for line in pageMoving:
					data = re.search(r"\">[0-9]{1,2}\s{1}((янв|февр|март|апре|май|июнь|июль|авг|сентя|октяб|нояб|декаб)[а-я]{1,}\s[0-9]{4})|\">[0-9]{1,2}\s{1}(ма(а|я)\s[0-9]{4})", line)
					if data:

						'''
						Determine the article's publication date
						'''
						dates = data.group()
						dates2 = dates.split()
						year = dates2[2]
						month = dates2[1]

						create_folder(path, year, transpose_month(month), "html")
						shutil.move(file_html, path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html)
						print("FILE "+str(i)+" HB MOVED")


						'''
						Create the directory for the XML files
						'''
						create_folder(path, year, transpose_month(month), "xmlFile")

						forxml = path+"xmlFile/"+year+"/"+transpose_month(month)+"/"+dest_html
						forxml_dir = path+"xmlFile/"+year+"/"+transpose_month(month)+"/"
						xml_stem = forxml_dir + str(i) + "_mystem.xml"
						rofxml = path+"xmlFile/"+year+"/"+transpose_month(month)+"/"+xmlFile

						'''
						Copy html -> xml dir for further processing
						'''

						shutil.copy(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html, forxml)
						print("FILE "+str(i)+" HB COPIED TO XML")
						openindosug_xml = open(forxml, "r")
						read_and_clean_xml = openindosug_xml.read()
						xml_data = amixml(read_and_clean_xml)
						#print(xml_data[2])
						openindosug_xml.close()
						'''
						Create the directory for plain text
						'''
						
						create_folder(path, year, transpose_month(month), "plain")
						forplain = path+"plain/"+year+"/"+transpose_month(month)+"/"+dest_html
						forplain_dir = path+"plain/"+year+"/"+transpose_month(month)+"/"
						shutil.copy(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html, forplain)
						print("FILE "+str(i)+" HB COPIED TO PLAIN")
						openindosug = open(forplain, "r")

						dates = re.sub("\">", "", dates)


						'''
						wri = list used to build the article INFO row
						'''

						wri = ["briansk.ru", str(xml_data[1]), toddmmyyy(dates), "", row['url']]


						page2_txt = open(str(forplain_dir)+str(plain), 'w')
						for datline in openindosug:
							page2_txt.write(str(make_it_clean(datline)))
						
						page2_txt.close()
						print("PLAIN FOR "+str(i)+" HB CREATED")

						'''
						Final cleanup of the plain file: keep only the article text, or text + INFO
						'''
						provide_clean_file(forplain_dir+str(plain),forplain_dir+str(plain_new), wri, "extra")
						provide_clean_file(forplain_dir+str(plain),forplain_dir+str(plain_stem), wri, "mystem")
						os.remove(forplain_dir+str(plain))
						os.remove(forplain)
						openindosug.close()

						'''
						xml_data[0] -- content
						xml_data[1] -- headerTag
						xml_data[2] -- content date
						'''

						'''
						Generate the XML
						'''
						pageEtree = etree.Element('html')
						doc = etree.ElementTree(pageEtree)
						infoTag = etree.SubElement(pageEtree, "body")
						dateTag = etree.SubElement(infoTag, "h1")
						dateTag.text = str(xml_data[2])
						headerTag = etree.SubElement(infoTag, "h2")
						headerTag.text = str(xml_data[1])
						mainTag = etree.SubElement(infoTag, "h3")
						contentTag = etree.SubElement(infoTag, "h4")
						contentTag.text = str(xml_data[0])
						outFile = open(str(forxml_dir)+str(i)+".xml", 'wb')
						doc.write(outFile, xml_declaration=True, encoding='utf-16') 
						outFile.close()
						print("FILE "+str(i)+" HB CODED TO XML")

						writer.writerow([str(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html) , "briansk.ru" , "" , "" , str(xml_data[1]) , toddmmyyy(dates), 'публицистика' , "" , "" , "категория" , "" , "нейтральный" , "н-возраст" , "н-уровень" , "городская" , str(row['url']) , "брянск.ru" , "" , str(year) , "газета" , "Россия" , "БРЯНСК" , "ru"])
						os.remove(forxml)


						input_plain = forplain_dir + plain_stem
						output_plain = forplain_dir + output_plain_stem


						'''
						Lemmatization: pymystem3 in-process, then the external mystem binary
						'''
						
						with open(input_plain) as file:
						    text = file.read()
						

						lemmas = m.lemmatize(text)
						with open(input_plain, 'w') as file:
							file.write(''.join(lemmas))

						os.system(r'/home/haniani/Загрузки/mystem -icd '+ input_plain + ' ' + output_plain)
						os.system(r'/home/haniani/Загрузки/mystem -icd --format xml '+ input_plain +' '+ xml_stem)
						

						print("MYSTEM'ed "+str(i))
						break

				i += 1
				print("PASSED ; NEXT: "+str(i)+"\n")
	csv_file.close()
	        
	for file in glob.glob(path+"*.html"):
		os.remove(file)
Beispiel #55
0
# coding:utf-8
"""
Script normalizing sentences from sentences.txt
and saving them to PREFIX + "norm_sentences.txt"
"""
import re
from pymystem3 import Mystem

normalizer = Mystem()

PREFIX = "mp_"

with open("../data/" + PREFIX + "norm_sentences.txt", "w") as writer:
    count = 0
    raw = []
    normalized = []

    for line in open("../data/" + PREFIX + "parsed.txt"):

        if count % 1000 == 0:
            print count

        line = re.sub("[\W\d]+", " ", line.strip().decode("utf-8").strip(), 0, re.UNICODE)
        line = re.sub("\s+", " ", line.strip(), 0, re.UNICODE).lower()
        raw.extend(line.split(" "))
        writer.write("* " + line.encode("utf-8") + " **;")
        # print line, '->',
        line = " ".join(normalizer.lemmatize(line))
        line = re.sub("\s+", " ", line, 0, re.UNICODE)
        lemmatized = filter(lambda x: len(x.strip()) > 0, normalizer.lemmatize(line))
        normalized.extend(lemmatized)
Beispiel #56
0
    def extract(self):
        try:
            # figure out which files are in the input directory
            input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))


                # handle negation: glue 'не' to the next word (or skip an intensifier in between)
                nums_of_bigrams = []
                helping_words = [u'совсем', u'очень', u'слишком', u'самый']
                for i in range(0, len(list_of_terms) - 1):
                    if list_of_terms[i] == u'не' and list_of_terms[i+1] not in helping_words:
                        nums_of_bigrams.append((i, i+1))
                    elif list_of_terms[i] == u'не' and list_of_terms[i+1] in helping_words and i + 2 < len(list_of_terms):
                        nums_of_bigrams.append((i, i+2))
                for i in range(0, len(nums_of_bigrams)):
                    if nums_of_bigrams[i][0] + 1 == nums_of_bigrams[i][1]:
                        list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + ' ' + list_of_terms[nums_of_bigrams[i][1]]
                        list_of_terms[nums_of_bigrams[i][1]] = ''
                    elif nums_of_bigrams[i][0] + 2 == nums_of_bigrams[i][1]:
                        list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + ' ' + list_of_terms[nums_of_bigrams[i][1]]
                        list_of_terms[nums_of_bigrams[i][1] - 1] = ''
                        list_of_terms[nums_of_bigrams[i][1]] = ''
                list_of_terms = filter(lambda x: x != '', list_of_terms)

                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # collapse duplicate words into counts
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # compute tf
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # compute idf
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # write the result
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
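The method above stores tf (term count divided by document length) and idf (natural log of the number of documents over the document frequency) separately; a ranking weight is then their product. A short sketch of reading one of the *_tf-idf files it writes and printing the weights; the path is a placeholder:

import json

with open('output/review_001.json_tf-idf') as f:   # placeholder path
    doc = json.load(f)

for term, stats in sorted(doc['terms'].items()):
    weight = stats['tf'] * stats['idf']            # classic tf-idf weight
    print(u'%s\ttf=%.4f  idf=%.4f  tf-idf=%.4f' % (term, stats['tf'], stats['idf'], weight))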
Beispiel #57
0
def lemma(text):
    m = Mystem()
    lemmas = m.lemmatize(text)
    titleStemmed = ''.join(lemmas)
    return titleStemmed
Beispiel #58
0
    def extract(self):
        try:
            # figure out which files are in the input directory
            input_files = filter(lambda x: not x.endswith("~"), os.listdir(self.input_directory))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + "/" + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data["text"]))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
                # collapse runs of repeated letters in each term (elongated words)
                my_list_of_terms = []
                for term in list_of_terms:
                    my_term = term
                    term = u""
                    prev_letter = my_term[0]
                    term += my_term[0]
                    for i in range(1, len(my_term)):
                        if my_term[i] != prev_letter:
                            term += my_term[i]
                        prev_letter = my_term[i]
                    my_list_of_terms.append(term)
                list_of_terms = my_list_of_terms
                output_data[file] = {}
                output_data[file]["id"] = data["id"]
                output_data[file]["positive"] = data["positive"]
                output_data[file]["sarcasm"] = data["sarcasm"]
                output_data[file]["terms"] = {}
                # collapse duplicate words into counts
                for term in list_of_terms:
                    if term not in output_data[file]["terms"]:
                        output_data[file]["terms"][term] = 1
                    else:
                        output_data[file]["terms"][term] += 1
                for term in output_data[file]["terms"].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # compute tf
                    count_of_terms = output_data[file]["terms"][term]
                    output_data[file]["terms"][term] = {
                        "tf": float(count_of_terms) / len(list_of_terms),
                        "idf": 0,
                        "count": count_of_terms,
                    }

            for file in input_files:
                # compute idf
                for term in output_data[file]["terms"].keys():
                    output_data[file]["terms"][term]["idf"] = math.log(
                        float(len(input_files)) / list_of_all_terms[term]
                    )
                # write the result
                with open(self.output_directory + "/" + file + "_tf-idf", "w") as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
Beispiel #59
0
from pymystem3 import Mystem
import logging
import re

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(message)s',
                    handlers=[logging.StreamHandler()])

logging.info("Loading mystem")
m = Mystem()
logging.info("Loaded mystem")


def parse_gr(gr):
    options = re.search('\(([^\)]*)\)', gr, re.IGNORECASE)

    if options:
        title = options.group(1)
        for stuff in title.split('|'):
            yield gr.replace("(" + title + ")", stuff)
    else:
        yield gr


lines = set([])

with open("data/test.txt", "r") as input_file:
    logging.info("file opened")

    for line in input_file:
        for w in m.analyze(line):
Beispiel #60
0
import os, json, dicttoxml
from pymystem3 import Mystem

m = Mystem()
top = 'C:\\Users\\John\\Desktop\\py_files\\питон\\korpus\\no_marks'
for root, dirs, files in os.walk(top):
    for name in files:
        loc = os.path.join(root, name)
        loc_list = loc.split('\\')  #creates list in order to remove path content
        new_root = loc.replace('\\no_marks\\{0}\\{1}\\{2}'.format(loc_list[8], loc_list[9], loc_list[10]), '') #removes path ending
        dir_marks = os.path.join(new_root + '\\marks\\{0}\\{1}'.format(loc_list[8], loc_list[9]))   #adds new path ending for json.docs
        dir_xml = os.path.join(new_root + '\\xml\\{0}\\{1}'.format(loc_list[8], loc_list[9]))       #adds new path ending for xml docs
        new_name = name.replace('.txt', '')
        if not os.path.exists(dir_marks):   #makes necessary dirs if not present
            os.makedirs(dir_marks)
        if not os.path.exists(dir_xml):
            os.makedirs(dir_xml)
        with open(loc, "r", encoding = 'utf-8') as doc:
            text_doc = doc.read()
            lines = doc.readlines()
            info = json.dumps(m.analyze(text_doc), ensure_ascii = False)  #creates text file with gram and lem info
        with open("{0}\\{1}.json".format(dir_marks, new_name), 'w', encoding = 'utf-8') as doc_marks:
            doc_marks.write(info)
        xml = dicttoxml.dicttoxml(info).decode('utf-8')     #converts json to xml
        with open("{0}\\{1}.xml".format(dir_xml, new_name), 'w', encoding = 'utf-8') as doc_xml:
            doc_xml.write(xml)