Example 1
0
    def __init__(self, parent=None):
        """Build the semantic-analysis widget: graph editor, morphology
        parser, and the export/edit drop-down menus."""
        QWidget.__init__(self, parent)

        self.setMinimumSize(600, 400)

        # NOTE(review): with no seed value, seed(version=2) re-seeds from
        # system entropy; the version argument has no effect — confirm intent.
        random.seed(version=2)

        self.__graph_editor = GraphEditor()
        self.__morph = MorphAnalyzer()

        # Buttons that open the export / edit menus.
        self.__btn_save = QPushButton("Экспорт..", self)
        self.__btn_delete = QPushButton("Правка", self)

        # Export drop-down: one action per output format.
        export_menu = QMenu()
        self.__action_to_owl = export_menu.addAction("Web-онтология (owl-файл)")
        self.__action_to_svg = export_menu.addAction("Файл векторной графики (svg-файл)")
        self.__action_to_cmap = export_menu.addAction("Концептуальная карта (txt-файл)")
        self.__action_to_png = export_menu.addAction("Изображение")
        self.__export_menu = export_menu

        self.__btn_save.setMenu(export_menu)
        export_menu.triggered[QAction].connect(self.__process_export_menu)

        # Edit drop-down: currently only "delete selected".
        edit_menu = QMenu()
        self.__action_remove_selected = edit_menu.addAction("Удалить выбранные элементы")
        self.__edit_menu = edit_menu

        self.__btn_delete.setMenu(edit_menu)
        edit_menu.triggered[QAction].connect(self.__process_edit_menu)

        self.__setup_ui()
Example 2
0
    def __init__(self, text=None, tokens=None):
        """Keep the raw text and its token list; defer document parsing."""
        # Source text and (optionally pre-computed) tokens.
        self.__text = text
        self.__tokens = tokens

        # Morphological analyzer used by later processing steps.
        self.__morph_analyzer = MorphAnalyzer()

        # The parsed document is built lazily elsewhere.
        self.__doc = None
Example 3
0
    def __init__(self, content=None, parent=None):
        """Set up the linguistic analyzer window and its child widgets."""
        super(LinguisticAnalysisWidget, self).__init__(parent)

        # Morphology backend queried when a word is analyzed.
        self.__morph = MorphAnalyzer()

        # Word input plus the button that launches the analysis.
        self.__push_button = QPushButton("Анализ")
        self.__line_edit = QLineEdit()

        # Output labels: parse summary and the complete result list.
        self.__all_results = QLabel()
        self.__linguistic = QLabel()

        self.setWindowTitle("Лингвистический анализатор")

        self.setup_ui()
Example 4
0
    def __init__(self, text=None):
        """Store the text to analyze and train the text classifier."""
        self.__text = text
        self.__tokens = None

        # Analyzer used for token normalization.
        self.__morph_analyzer = MorphAnalyzer()

        # Filled in by analysis(): word frequencies and predicted category.
        self.__frequency = None
        self.__category = None

        # Directory with bundled resources (stop words, sqlite corpus).
        self.__common_dir = os.getcwd() + '/common/'

        self.__text_classifier = None
        self.train_classifier()
Example 5
0
class SemanticAnalysisWidget(QWidget):
    """Widget hosting an editable semantic graph with export facilities.

    The graph is drawn by a ``GraphEditor``; two menu buttons expose
    export (OWL / SVG / CMap text / PNG) and edit (delete selection)
    actions.
    """

    def __init__(self, parent=None):
        QWidget.__init__(self, parent)

        self.setMinimumSize(600, 400)

        # NOTE(review): with no seed value, seed(version=2) re-seeds from
        # system entropy; the version argument has no effect — confirm intent.
        random.seed(version=2)

        self.__graph_editor = GraphEditor()

        # Morphological analyzer used for POS filtering during OWL export.
        self.__morph = MorphAnalyzer()

        self.__btn_save = QPushButton("Экспорт..", self)
        self.__btn_delete = QPushButton("Правка", self)

        # Export drop-down: one action per output format.
        self.__export_menu = QMenu()
        self.__action_to_owl = self.__export_menu.addAction("Web-онтология (owl-файл)")
        self.__action_to_svg = self.__export_menu.addAction("Файл векторной графики (svg-файл)")
        self.__action_to_cmap = self.__export_menu.addAction("Концептуальная карта (txt-файл)")
        self.__action_to_png = self.__export_menu.addAction("Изображение")

        self.__btn_save.setMenu(self.__export_menu)

        self.__export_menu.triggered[QAction].connect(self.__process_export_menu)

        # Edit drop-down: currently only "delete selected".
        self.__edit_menu = QMenu()
        self.__action_remove_selected = self.__edit_menu.addAction("Удалить выбранные элементы")

        self.__btn_delete.setMenu(self.__edit_menu)

        self.__edit_menu.triggered[QAction].connect(self.__process_edit_menu)

        self.__setup_ui()

    def __setup_ui(self):
        """Lay out the two menu buttons in a row above the graph editor."""
        vbox_layout = QVBoxLayout()

        self.__btn_save.setMaximumWidth(100)
        self.__btn_delete.setMaximumWidth(100)

        hbox_layout = QHBoxLayout()

        spacer = QSpacerItem(40, 20, QSizePolicy.Expanding, QSizePolicy.Minimum)

        hbox_layout.addWidget(self.__btn_save)
        hbox_layout.addItem(spacer)
        hbox_layout.addWidget(self.__btn_delete)

        vbox_layout.addLayout(hbox_layout)
        vbox_layout.addWidget(self.__graph_editor)

        self.setLayout(vbox_layout)

    def __remove_selected_items(self):
        """Ask for confirmation, then delete the selected scene items."""
        msg = QMessageBox()

        msg.setIcon(QMessageBox.Question)
        msg.setText("Вы действительно хотите удалить выделенные блоки?")
        msg.setInformativeText("Это действие нельзя отменить")
        msg.setWindowTitle("Подтвердите действие")
        msg.setDetailedText("При удалении блока удалятся все его соединения!")

        msg.setStandardButtons(QMessageBox.Ok | QMessageBox.Cancel)

        # Only prompt when something is actually selected.
        if len(self.__graph_editor.diagramScene.selectedItems()) > 0:
            confirm = msg.exec_()

            if confirm == QMessageBox.Ok:
                self.__graph_editor.remove_selected_items()

    def __process_edit_menu(self, menu_item):
        """Dispatch an action chosen from the edit menu by its title."""
        if menu_item.text() == "Удалить выбранные элементы":
            self.__remove_selected_items()

    def __process_export_menu(self, menu_item):
        """Dispatch an action chosen from the export menu by its title."""
        if menu_item.text() == "Web-онтология (owl-файл)":
            self.export_to_owl()
        elif menu_item.text() == "Файл векторной графики (svg-файл)":
            self.export_to_svg()
        elif menu_item.text() == "Концептуальная карта (txt-файл)":
            self.export_to_cmap()
        elif menu_item.text() == "Изображение":
            self.export_to_png()

    def export_to_png(self):
        """
        Graph editor's scene rendering & saving to png-file
        """

        filename = QFileDialog.getSaveFileName(None, 'Сохранить в формате png', "", "Images (*.png)")

        # getSaveFileName returns (path, filter); a valid "x.png" path
        # is at least 5 characters long.
        if len(filename[0]) > 4:
            # Scene dimensions are qreal (float); QImage requires ints.
            w = int(self.__graph_editor.get_diagram_scene().width())
            h = int(self.__graph_editor.get_diagram_scene().height())

            image = QImage(w, h, QImage.Format_ARGB32_Premultiplied)
            image.fill(Qt.white)
            painter = QPainter()
            painter.begin(image)
            painter.setRenderHint(QPainter.Antialiasing)
            self.__graph_editor.get_diagram_scene().render(painter)
            painter.end()

            image.save(filename[0])

    def export_to_svg(self):
        """
        Graph editor's scene rendering & saving to svg-file
        :return: None
        """

        filename = QFileDialog.getSaveFileName(None, 'Сохранить SVG-граф', "", "svg files (*.svg)")

        if len(filename[0]) > 4:
            svg = QSvgGenerator()
            svg.setFileName(filename[0])

            # Scene dimensions are qreal (float); QSize/QRect require ints.
            w = int(self.__graph_editor.get_diagram_scene().width())
            h = int(self.__graph_editor.get_diagram_scene().height())

            svg.setSize(QSize(w, h))
            svg.setViewBox(QRect(0, 0, w, h))
            svg.setTitle('Semantic Graph')
            svg.setDescription('File created by RTA')

            painter = QPainter()
            painter.begin(svg)
            self.__graph_editor.get_diagram_scene().render(painter)
            painter.end()

    def export_to_cmap(self):
        """
        Graph editor's relations saving to txt-file
        :return: None
        """

        filename = QFileDialog.getSaveFileName(None, 'Экспорт в CMap', "", "Text files (*.txt)")

        if len(filename[0]) > 4:
            out = ""

            processed_nodes = []

            # One tab-separated line per connection: node, link type, node.
            for connection in self.__graph_editor.get_all_connections():
                first_node = connection.get_first_node().get_text()
                last_node = connection.get_last_node().get_text()
                link_type = connection.get_link_type()

                processed_nodes.append(first_node)
                processed_nodes.append(last_node)

                out += '{0}\t{1}\t{2}'.format(first_node, link_type, last_node) + '\n'

            # Nodes without connections are written on their own line.
            for node in self.__graph_editor.get_all_nodes():
                if node.get_text() not in processed_nodes:
                    processed_nodes.append(node.get_text())

                    out += node.get_text() + '\n'

            with open(filename[0], 'w', encoding='utf-8') as f:
                f.write(out)

    def export_to_owl(self):
        """
        Graph editor's relations export to owl-file
        :return: None
        """

        filename = QFileDialog.getSaveFileName(None, 'Экспорт в OWL', "", "OWL files (*.owl)")

        if len(filename[0]) > 4:
            processed_nodes = []
            triples = []

            for connection in self.__graph_editor.get_all_connections():
                first_node = connection.get_first_node().get_text()
                second_node = connection.get_last_node().get_text()
                link_type = connection.get_link_type()

                # POS-tag both endpoints; only noun nodes are exported.
                first_node_pos = [item.tag.POS for item in self.__morph.parse(first_node)]
                second_node_pos = [item.tag.POS for item in self.__morph.parse(second_node)]

                if 'NOUN' in first_node_pos and 'NOUN' in second_node_pos:
                    triples.append([first_node, link_type, second_node])

                    processed_nodes.append(first_node)
                    processed_nodes.append(second_node)

            for node in self.__graph_editor.get_all_nodes():
                if node.get_text() not in processed_nodes:
                    # POS-tag isolated nodes too; keep nouns only.
                    node_pos = [item.tag.POS for item in self.__morph.parse(node.get_text())]

                    if 'NOUN' in node_pos:
                        processed_nodes.append(node.get_text())
                        triples.append([node.get_text()])

            owl_exporter = TriplesToOWL(triples)
            owl_exporter.processing()
            owl_exporter.to_file(filename[0])

    def load_diagram_from_graph(self, graph):
        """Rebuild the diagram from the edges of *graph*."""
        self.__graph_editor.load_diagram_from_graph(graph)
Example 6
0
class StatisticalAnalysis:
    """
    Statistical analysis module, which includes:
    - Text classification
    - Creation of semantic core
    - Statistical information about the text
    """

    def __init__(self, text=None):
        self.__text = text
        self.__tokens = None

        # Morphological analyzer used to normalize tokens.
        self.__morph_analyzer = MorphAnalyzer()

        # Computed by analysis(): word -> frequency (%) and the category.
        self.__frequency = None
        self.__category = None

        self.__text_classifier = None

        # Resource directory: stop words list and the sqlite training corpus.
        self.__common_dir = os.getcwd() + '/common/'

        self.train_classifier()

    def analysis(self):
        """
        Complex statistical analysis: tokenize, compute word frequencies
        (stop words excluded) and predict the text category.
        """

        self.__pre_processing()

        with open(self.__common_dir + 'stop-words.txt', 'r', encoding='utf-8') as f:
            stop_words = set(f.read().split('\n'))

        # Count each distinct token once instead of rescanning the whole
        # token list per occurrence (the previous approach was O(n^2)).
        counts = {}
        for token in self.__tokens:
            counts[token] = counts.get(token, 0) + 1

        unsorted_frequency = {
            token: self.__word_frequency(token, entry_count)
            for token, entry_count in counts.items()
            if token not in stop_words
        }

        self.__frequency = OrderedDict(sorted(unsorted_frequency.items(), key=lambda x: x[1], reverse=True))

        processed_text = self.__text_cleaner_with_stemming(self.__text)

        self.__category = self.__predict(text=processed_text)

    def set_text(self, text):
        """Replace the text under analysis."""
        self.__text = text

    def get_words_frequency(self, number=None):
        """
        Returns frequency of all words in the text
        :param number: number of required elements (None = all)
        :return: dictionary, key - word, value - frequency
        """

        if number is None or number > len(self.__frequency):
            return self.__frequency

        # Keep only the `number` most frequent entries, preserving order.
        keys = list(self.__frequency.keys())[:number]
        required = OrderedDict()
        for key in keys:
            required[key] = self.__frequency[key]
        return required

    def get_characters_count(self):
        """
        Returns characters count, including spaces
        """
        return len(self.__text)

    def get_characters_count_without_spaces(self):
        """
        Returns characters count without spaces
        """
        return len(self.__text.replace(" ", ""))

    def get_words_count(self):
        """Returns the number of tokens produced by analysis()."""
        return len(self.__tokens)

    def get_text_category(self):
        """
        Returns the text category (cyrillic name) obtained using TF-IDF
        """
        return self.get_cyr_category_repr(self.__category)

    def train_classifier(self):
        """
        Loading classifier data and training the classifier
        """
        data = self.__load_classifier_data()

        # Cleaning the training data
        data['text'] = [self.__text_cleaner_with_stemming(t) for t in data['text']]

        # Split the training data
        d = self.__split_training_data(data)

        # Training the classifier
        self.__text_classifier = Pipeline([
            ('hashvect', HashingVectorizer()),
            ('tfidf', TfidfTransformer(use_idf=False)),
            ('clf', SGDClassifier(loss='hinge')),
        ])

        self.__text_classifier.fit(d['train']['x'], d['train']['y'])

    def __pre_processing(self):
        """
        Text tokenization followed by token normalization
        """

        blob = TextBlob(self.__text_cleaner(self.__text))
        self.__tokens = list(blob.tokens)

        self.__normalize_tokens()

    def __predict(self, text):
        """
        Predicting of the category of the text
        :param text: source raw text
        :return: one of 13 categories, or None for an unknown label
        """

        source_text = [self.__text_cleaner_with_stemming(text)]
        predicted = self.__text_classifier.predict(source_text)

        # Map the classifier's string label onto the TextCategory enum.
        return {
            'politics': TextCategory.POLITICS,
            'culture': TextCategory.CULTURE,
            'sport': TextCategory.SPORT,
            'health': TextCategory.HEALTH,
            'tech': TextCategory.TECH,
            'economics': TextCategory.ECONOMICS,
            'incident': TextCategory.INCIDENT,
            'auto': TextCategory.AUTO,
            'woman': TextCategory.WOMAN,
            'advertising': TextCategory.ADVERTISING,
            'social': TextCategory.SOCIAL,
            'realty': TextCategory.REALTY,
            'science': TextCategory.SCIENCE,
        }.get(predicted[0])

    @staticmethod
    def get_cyr_category_repr(category):
        """
        Returns cyrillic string representation of text category
        :param category: one of TextCategory enumeration
        :return: category string (None for an unknown category)
        """

        return {
            TextCategory.POLITICS: "ПОЛИТИКА",
            TextCategory.CULTURE: "КУЛЬТУРА",
            TextCategory.SPORT: "СПОРТ",
            TextCategory.HEALTH: "ЗДОРОВЬЕ",
            TextCategory.TECH: "ТЕХНОЛОГИИ",
            TextCategory.ECONOMICS: "ЭКОНОМИКА",
            TextCategory.INCIDENT: "ИНЦИДЕНТ",
            TextCategory.AUTO: "ТРАНСПОРТ",
            TextCategory.WOMAN: "ЖЕНЩИНЫ",
            TextCategory.ADVERTISING: "РЕКЛАМА",
            TextCategory.SOCIAL: "СОЦИАЛЬНАЯ СФЕРА",
            TextCategory.REALTY: "НЕДВИЖИМОСТЬ",
            TextCategory.SCIENCE: "НАУКА",
        }.get(category)

    @staticmethod
    def get_category_by_cyr_repr(category):
        """
        Returns text category by cyrillic string representation
        (None if the string is not recognized)
        """

        return {
            "ПОЛИТИКА": TextCategory.POLITICS,
            "КУЛЬТУРА": TextCategory.CULTURE,
            "СПОРТ": TextCategory.SPORT,
            "ЗДОРОВЬЕ": TextCategory.HEALTH,
            "ТЕХНОЛОГИИ": TextCategory.TECH,
            "ЭКОНОМИКА": TextCategory.ECONOMICS,
            "ИНЦИДЕНТ": TextCategory.INCIDENT,
            "ТРАНСПОРТ": TextCategory.AUTO,
            "ЖЕНЩИНЫ": TextCategory.WOMAN,
            "РЕКЛАМА": TextCategory.ADVERTISING,
            "СОЦИАЛЬНАЯ СФЕРА": TextCategory.SOCIAL,
            "НЕДВИЖИМОСТЬ": TextCategory.REALTY,
            "НАУКА": TextCategory.SCIENCE,
        }.get(category)

    @staticmethod
    def get_sql_category_repr(category):
        """
        Returns the database (latin) string representation of text category
        :param category: one of TextCategory enumeration
        :return: category string ("unknown" for an unrecognized category)
        """

        return {
            TextCategory.POLITICS: "politics",
            TextCategory.CULTURE: "culture",
            TextCategory.SPORT: "sport",
            TextCategory.HEALTH: "health",
            TextCategory.TECH: "tech",
            TextCategory.ECONOMICS: "economics",
            TextCategory.INCIDENT: "incident",
            TextCategory.AUTO: "auto",
            TextCategory.WOMAN: "woman",
            TextCategory.ADVERTISING: "advertising",
            TextCategory.SOCIAL: "social",
            TextCategory.REALTY: "realty",
            TextCategory.SCIENCE: "science",
        }.get(category, "unknown")

    def update_classifier_data(self, text, category):
        """
        Update rss-all.sqlite database with a new training sample
        :param text: new text
        :param category: the category of the text
        """

        category = self.get_sql_category_repr(category)

        db_name = self.__common_dir + 'rss-all.sqlite'

        conn = sqlite3.connect(db_name)
        try:
            c = conn.cursor()

            # Parameterized query: the text is user data, never interpolated.
            sql_insert = """
                INSERT INTO data VALUES(null, ?, ?)
            """

            c.execute(sql_insert, (text, category))
        finally:
            # NOTE(review): commit/retrain run even if execute raised —
            # preserved from the original; confirm whether that is intended.
            conn.commit()
            conn.close()

            # Retrain so the classifier picks up the new sample immediately.
            self.train_classifier()

    def __load_classifier_data(self):
        """
        Loading rss-feed from sqlite database

        :return: dictionary, which contains list of texts and list of their categories
        """

        db_name = self.__common_dir + 'rss-all.sqlite'

        data = {'text': [], 'tag': []}

        conn = sqlite3.connect(db_name)
        try:
            c = conn.cursor()
            # Row layout: (id, text, tag).
            for row in c.execute('SELECT * FROM data'):
                data['text'].append(row[1])
                data['tag'].append(row[2])
        finally:
            conn.close()

        return data

    @staticmethod
    def __split_training_data(data, validation_split=0.0):
        """
        Split source texts into two parts: training and testing
        :param data: source texts
        :param validation_split: fraction of samples held out for testing
        :return: dict, which contains training and testing data
        """

        sz = len(data['text'])
        indices = np.arange(sz)
        np.random.shuffle(indices)

        X = [data['text'][i] for i in indices]
        Y = [data['tag'][i] for i in indices]
        nb_validation_samples = int(validation_split * sz)

        # BUG FIX: the previous version returned the held-out slice as
        # 'train' and the bulk as 'test' whenever validation_split > 0.
        if nb_validation_samples:
            return {
                'train': {'x': X[:-nb_validation_samples], 'y': Y[:-nb_validation_samples]},
                'test': {'x': X[-nb_validation_samples:], 'y': Y[-nb_validation_samples:]}
            }
        return {
            'train': {'x': X, 'y': Y},
            'test': {'x': [], 'y': []}
        }

    def __normalize_tokens(self):
        """
        Replace tokens with their normal form
        """
        self.__tokens = [
            self.__morph_analyzer.get_best_parse_result(token).normal_form
            for token in self.__tokens
        ]

    def __entry_count(self, word):
        """
        Find the number of occurrences of word in the text
        :param word: Target word
        :return: number of occurrences
        """
        return self.__tokens.count(word)

    def __word_frequency(self, word, entry_count=None):
        """
        Find the frequency of source word in the text
        :param word: source word
        :param entry_count: pre-computed number of occurrences (optional)
        :return: frequency in percents, rounded to 2 decimals
        """
        total = len(self.__tokens)
        if not total:
            return 0.0  # empty token list: avoid division by zero

        if entry_count is None:
            entry_count = self.__entry_count(word)

        return round(100 * entry_count / total, 2)

    @staticmethod
    def __replace_entities(raw_text):
        """Lowercase the text; replace URLs, dates, times, twitter handles
        and hashtags with placeholder words; strip html tags and punctuation."""

        raw_text = raw_text.lower()

        raw_text = re.sub(r'https?://[\S]+', ' url ', raw_text)  # internet links
        raw_text = re.sub(r'[\w\./]+\.[a-z]+', ' url ', raw_text)

        raw_text = re.sub(r'\d+[-/\.]\d+[-/\.]\d+', ' date ', raw_text)  # dates
        raw_text = re.sub(r'\d+ ?гг?', ' date ', raw_text)
        raw_text = re.sub(r'\d+:\d+(:\d+)?', ' time ', raw_text)
        raw_text = re.sub(r'@\w+', ' tname ', raw_text)  # twitter names
        raw_text = re.sub(r'#\w+', ' htag ', raw_text)  # hashtags

        raw_text = re.sub(r'<[^>]*>', ' ', raw_text)  # html tags
        raw_text = re.sub(r'[\W]+', ' ', raw_text)  # leftover symbols

        return raw_text

    @staticmethod
    def __drop_noise_words(raw_text):
        """Remove stop words and stand-alone letters; replace numbers
        with the placeholder 'digit'."""

        stw = ['в', 'по', 'на', 'из', 'и', 'или', 'не', 'но', 'за', 'над', 'под', 'то',
               'a', 'at', 'on', 'of', 'and', 'or', 'in', 'for']
        # BUG FIX: the closing ')\b' used to be a non-raw string, so '\b'
        # was a literal backspace character instead of a word boundary.
        remove = r'\b(' + '|'.join(stw) + r')\b'
        raw_text = re.sub(remove, ' ', raw_text)

        raw_text = re.sub(r'\b\w\b', ' ', raw_text)  # stand-alone letters

        raw_text = re.sub(r'\b\d+\b', ' digit ', raw_text)  # numbers

        return raw_text

    @staticmethod
    def __text_cleaner(raw_text):
        """
        Using regexp to clean up the text

        :param raw_text: source text
        :return: clean text
        """

        raw_text = StatisticalAnalysis.__replace_entities(raw_text)
        return StatisticalAnalysis.__drop_noise_words(raw_text)

    @staticmethod
    def __text_cleaner_with_stemming(raw_text):
        """
        Using regexp to clean up the text, then stemming every word

        :param raw_text: source text
        :return: clean, stemmed text
        """

        raw_text = StatisticalAnalysis.__replace_entities(raw_text)

        # Stemming happens before stop-word removal, as in the original.
        stemmer = Stemmer('russian')
        raw_text = ' '.join(stemmer.stemWords(raw_text.split()))

        return StatisticalAnalysis.__drop_noise_words(raw_text)
Example 7
0
class SemanticAnalysisWidget(QWidget):
    """Widget that displays an editable semantic graph and exports it to
    OWL, SVG, CMap (txt) or PNG formats via a single export menu."""

    def __init__(self, parent=None):
        QWidget.__init__(self, parent)

        self.setMinimumSize(600, 400)

        # NOTE(review): with no seed value, seed(version=2) re-seeds from
        # system entropy; the version argument has no effect — confirm intent.
        random.seed(version=2)

        self.__graph_editor = GraphEditor()

        # Morphological analyzer used for POS filtering during OWL export.
        self.__morph = MorphAnalyzer()

        self.__btn_save = QPushButton("Экспорт..", self)

        # Export drop-down: one action per output format.
        self.__export_menu = QMenu()
        self.__action_to_owl = self.__export_menu.addAction(
            "Web-онтология (owl-файл)")
        self.__action_to_svg = self.__export_menu.addAction(
            "Файл векторной графики (svg-файл)")
        self.__action_to_cmap = self.__export_menu.addAction(
            "Концептуальная карта (txt-файл)")
        self.__action_to_png = self.__export_menu.addAction("Изображение")

        self.__btn_save.setMenu(self.__export_menu)

        self.__export_menu.triggered[QAction].connect(
            self.__process_export_menu)

        self.__setup_ui()

    def __setup_ui(self):
        """Stack the export button above the graph editor."""
        vbox_layout = QVBoxLayout()

        self.__btn_save.setMaximumWidth(100)

        vbox_layout.addWidget(self.__btn_save)
        vbox_layout.addWidget(self.__graph_editor)

        self.setLayout(vbox_layout)

    def __process_export_menu(self, menu_item):
        """Dispatch an action chosen from the export menu by its title."""
        if menu_item.text() == "Web-онтология (owl-файл)":
            self.export_to_owl()
        elif menu_item.text() == "Файл векторной графики (svg-файл)":
            self.export_to_svg()
        elif menu_item.text() == "Концептуальная карта (txt-файл)":
            self.export_to_cmap()
        elif menu_item.text() == "Изображение":
            self.export_to_png()

    def export_to_png(self):
        """
        Graph editor's scene rendering & saving to png-file
        """

        filename = QFileDialog.getSaveFileName(None, 'Сохранить в формате png',
                                               "", "Images (*.png)")

        # getSaveFileName returns (path, filter); "x.png" is 5+ characters.
        if len(filename[0]) > 4:
            # Scene dimensions are qreal (float); QImage requires ints.
            w = int(self.__graph_editor.get_diagram_scene().width())
            h = int(self.__graph_editor.get_diagram_scene().height())

            image = QImage(w, h, QImage.Format_ARGB32_Premultiplied)
            image.fill(Qt.white)
            painter = QPainter()
            painter.begin(image)
            painter.setRenderHint(QPainter.Antialiasing)
            self.__graph_editor.get_diagram_scene().render(painter)
            painter.end()

            image.save(filename[0])

    def export_to_svg(self):
        """
        Graph editor's scene rendering & saving to svg-file
        :return: None
        """

        filename = QFileDialog.getSaveFileName(None, 'Сохранить SVG-граф', "",
                                               "svg files (*.svg)")

        if len(filename[0]) > 4:
            svg = QSvgGenerator()
            svg.setFileName(filename[0])

            # Scene dimensions are qreal (float); QSize/QRect require ints.
            w = int(self.__graph_editor.get_diagram_scene().width())
            h = int(self.__graph_editor.get_diagram_scene().height())

            svg.setSize(QSize(w, h))
            svg.setViewBox(QRect(0, 0, w, h))
            svg.setTitle('Semantic Graph')
            svg.setDescription('File created by RTA')

            painter = QPainter()
            painter.begin(svg)
            self.__graph_editor.get_diagram_scene().render(painter)
            painter.end()

    def export_to_cmap(self):
        """
        Graph editor's relations saving to txt-file
        :return: None
        """

        filename = QFileDialog.getSaveFileName(None, 'Экспорт в CMap', "",
                                               "Text files (*.txt)")

        if len(filename[0]) > 4:
            out = ""

            processed_nodes = []

            # One tab-separated line per connection: node, link type, node.
            for connection in self.__graph_editor.get_all_connections():
                first_node = connection.get_first_node().get_text()
                last_node = connection.get_last_node().get_text()
                link_type = connection.get_link_type()

                processed_nodes.append(first_node)
                processed_nodes.append(last_node)

                out += '{0}\t{1}\t{2}'.format(first_node, link_type,
                                              last_node) + '\n'

            # Nodes without connections are written on their own line.
            for node in self.__graph_editor.get_all_nodes():
                if node.get_text() not in processed_nodes:
                    processed_nodes.append(node.get_text())

                    out += node.get_text() + '\n'

            with open(filename[0], 'w', encoding='utf-8') as f:
                f.write(out)

    def export_to_owl(self):
        """
        Graph editor's relations export to owl-file
        :return: None
        """

        filename = QFileDialog.getSaveFileName(None, 'Экспорт в OWL', "",
                                               "OWL files (*.owl)")

        # Timing is reported on stdout when the export finishes.
        time_stamp = time.time()

        if len(filename[0]) > 4:
            processed_nodes = []
            triples = []

            for connection in self.__graph_editor.get_all_connections():
                first_node = connection.get_first_node().get_text()
                second_node = connection.get_last_node().get_text()
                link_type = connection.get_link_type()

                # POS-tag both endpoints; only noun nodes are exported.
                first_node_pos = [
                    item.tag.POS for item in self.__morph.parse(first_node)
                ]
                second_node_pos = [
                    item.tag.POS for item in self.__morph.parse(second_node)
                ]

                if 'NOUN' in first_node_pos and 'NOUN' in second_node_pos:
                    triples.append([first_node, link_type, second_node])

                    processed_nodes.append(first_node)
                    processed_nodes.append(second_node)

            for node in self.__graph_editor.get_all_nodes():
                if node.get_text() not in processed_nodes:
                    # POS-tag isolated nodes too; keep nouns only.
                    node_pos = [
                        item.tag.POS
                        for item in self.__morph.parse(node.get_text())
                    ]

                    if 'NOUN' in node_pos:
                        processed_nodes.append(node.get_text())
                        triples.append([node.get_text()])

            owl_exporter = TriplesToOWL(triples)
            owl_exporter.processing()
            owl_exporter.to_file(filename[0])

        print("Finished: time - {}".format((time.time() - time_stamp)))

    def load_diagram_from_graph(self, graph):
        """Rebuild the diagram from the edges of *graph*."""
        self.__graph_editor.load_diagram_from_graph(graph)
Example 8
0
class LinguisticAnalysisWidget(QWidget):
    """Widget for morphological analysis of a single Russian word.

    The user types a word, presses the analysis button and gets an HTML
    table with the best parse (part of speech, normal form, grammemes).
    When the parse is ambiguous, a second table with all other parse
    variants is shown as well.
    """

    # Grammeme attributes of the best parse paired with their Russian row
    # labels.  The tuple order defines the row order of the results table,
    # matching the original hand-written sequence of `if` blocks.
    _TAG_ROWS = (
        ('animacy', 'Одушевлённость'),
        ('aspect', 'Вид'),
        ('case', 'Падеж'),
        ('gender', 'Род'),
        ('mood', 'Наклонение'),
        ('number', 'Число'),
        ('person', 'Лицо'),
        ('tense', 'Время'),
        ('transitivity', 'Переходность'),
        ('voice', 'Залог'),
    )

    def __init__(self, content=None, parent=None):
        """Create the widget, its morph analyzer and child controls.

        :param content: unused here; kept for interface compatibility
        :param parent: parent QWidget, forwarded to QWidget.__init__
        """
        super(LinguisticAnalysisWidget, self).__init__(parent)

        self.setWindowTitle("Лингвистический анализатор")

        # pymorphy morphological analyzer used for all parses.
        self.__morph = MorphAnalyzer()

        self.__line_edit = QLineEdit()
        self.__push_button = QPushButton("Анализ")

        # Label with the best-parse table.
        self.__linguistic = QLabel()
        # Label with alternative parses; hidden unless the word is ambiguous.
        self.__all_results = QLabel()

        self.setup_ui()

    def setup_ui(self):
        """Build the layout, style the labels and wire up signals."""

        # Input is restricted to Cyrillic words.
        # NOTE(review): the class "[а-я-А-Я]+" also admits a literal '-'
        # (the middle hyphen) and excludes 'ё'/'Ё' — confirm intentional.
        regex = QRegExp("[а-я-А-Я]+")

        self.__line_edit.setValidator(QRegExpValidator(regex))

        label_style = """
                    background-color: rgb(150, 210, 57);
                    border-radius: 5px;
                    padding: 5px;
                    color: white;
                    border: 2px solid rgb(150, 210, 57);
                """

        self.__linguistic.setStyleSheet(label_style)
        self.__all_results.setStyleSheet(label_style)

        self.__linguistic.setText(
            '<table class="tg">\n\t<tr>\t\t<th><b>Результаты разбора: </b></th></tr></table>'
        )

        self.__push_button.clicked.connect(self.__processing)

        # Layout: input line + button on top, result tables below.
        hbox_layout = QHBoxLayout()

        spacer = QSpacerItem(10, 40, QSizePolicy.Minimum,
                             QSizePolicy.Expanding)

        hbox_layout.addWidget(self.__line_edit)
        hbox_layout.addItem(spacer)
        hbox_layout.addWidget(self.__push_button)

        vbox_layout = QVBoxLayout()

        vbox_layout.addItem(hbox_layout)
        vbox_layout.addWidget(self.__linguistic)
        vbox_layout.addWidget(self.__all_results)
        self.__all_results.hide()

        self.setLayout(vbox_layout)

    def __processing(self):
        """Run morphological analysis of the entered word and render results.

        Builds an HTML table from the best parse; if more than one parse
        variant exists, also shows a table with all alternative parses.
        """

        self.__all_results.hide()

        target_word = self.__line_edit.text()

        table = '<table class="tg">\n\t<tr>\t\t<th><b>Результаты разбора: </b></th></tr>'
        tr = "\n<tr>\n\t<td>{0}:</td>\n\t<td> {1}</td>\n</tr>"

        parsing_result = self.__morph.parse(target_word)

        best_result = self.__morph.get_best_parse_result(target_word)

        # Insertion order of this dict defines the table row order.
        analysis_data = {}

        analysis_data['Часть речи'] = self.__morph.lat2cyr(best_result.tag.POS)

        if best_result.normal_form is not None:
            analysis_data['Нормальная форма'] = best_result.normal_form

        # Collect every grammeme present on the best parse, in the fixed
        # display order defined by _TAG_ROWS (replaces ten copy-pasted
        # `if tag.X is not None` blocks).
        for attr, label in self._TAG_ROWS:
            value = getattr(best_result.tag, attr)
            if value is not None:
                analysis_data[label] = self.__morph.lat2cyr(value)

        for category, value in analysis_data.items():
            table += tr.format(category, value)

        table += "\n</table>"

        another_results = '<table class="tg">\n\t<tr>\t\t<th><b>Другие результаты: </b></th></tr>'

        for item in parsing_result:
            another_results += "\n<tr>\n\t<td>{0} : </td><td>{1}</td>\n</tr>".format(
                item.word, self.__morph.lat2cyr(item.tag.cyr_repr))

        another_results += "\n</table>"

        # Show the alternatives table only for ambiguous words.
        if len(parsing_result) > 1:
            self.__all_results.setText(another_results)
            self.__all_results.show()

        self.__linguistic.setText(table)
# --- Esempio n. 9 (example separator kept from the original source listing) ---
class MorphologicalAnalysis:
    """Morphological analysis stage of the text-processing pipeline.

    Runs graphematical analysis first (when raw text was supplied instead
    of a token list) and then annotates every Cyrillic token with a morph
    descriptor derived from the best-scoring pymorphy parse.
    """

    def __init__(self, text=None, tokens=None):
        """Accept either raw *text* or a pre-built *tokens* list.

        :param text: raw input text; tokenized lazily by __pre_processing
        :param tokens: pre-tokenized input; skips graphematical analysis
        """
        # Raw input text (used only when tokens were not supplied).
        self.__text = text

        # Token list; filled by __pre_processing when None.
        self.__tokens = tokens

        self.__morph_analyzer = MorphAnalyzer()

        # Document produced by the graphematical analyzer.
        self.__doc = None

        # BUGFIX: these attributes were never initialized, so get_emails /
        # get_links / get_hash_tags always raised AttributeError.
        # NOTE(review): defaulting to None — confirm whether they should be
        # populated from GraphematicalAnalysis instead.
        self.__emails = None
        self.__links = None
        self.__hash_tags = None

    def __pre_processing(self):
        """Tokenize the raw text if no token list was provided."""
        if self.__tokens is None:
            graphematic_analyzer = GraphematicalAnalysis(text=self.__text)
            graphematic_analyzer.analysis()
            self.__tokens = graphematic_analyzer.get_tokens()
            self.__doc = graphematic_analyzer.get_document()

    def analysis(self):
        """Attach a morph descriptor to every token and return the tokens.

        Cyrillic tokens get the POS label of their best-scoring parse plus
        its Cyrillic tag representation; everything else is marked OTHER.
        """
        self.__pre_processing()

        for token in self.__tokens:

            if Label.CYRIL in token.get_labels():
                parses = self.__morph_analyzer.parse(token.get_text())

                # Keep only the most probable parse variant.
                best = self.get_parse_by_score(parses)

                token.set_morph(self.pos_to_morph_label(best.tag.POS))
                token.set_morph_cyr(best.tag.cyr_repr)

            else:
                # Non-Cyrillic lexeme: no morphological information.
                token.set_morph(Morph.OTHER)
                token.set_morph_cyr("Не русская лексема")

        return self.__tokens

    @staticmethod
    def get_parse_by_score(target):
        """Return the parse variant with the highest score.

        Ties keep the earliest variant, matching the original strict-greater
        comparison loop.  *target* must be non-empty.
        """
        return max(target, key=lambda parse: parse.score)

    @staticmethod
    def pos_to_morph_label(pos):
        """Map a pymorphy POS tag string to the internal Morph descriptor.

        Unknown (including None) tags map to Morph.OTHER.
        """
        # Built inside the function so Morph is referenced lazily, exactly
        # like the original elif chain.
        mapping = {
            'NOUN': Morph.NOUN,
            'NPRO': Morph.NPRO,
            'NUMR': Morph.NUMR,
            'ADJF': Morph.ADJF,
            'ADJS': Morph.ADJS,
            'COMP': Morph.COMP,
            'VERB': Morph.VERB,
            'INFN': Morph.INFN,
            'PRTF': Morph.PRTF,
            'PRTS': Morph.PRTS,
            'GRND': Morph.GRND,
            'ADVB': Morph.ADVB,
            'PRED': Morph.PRED,
            'PREP': Morph.PREP,
            'CONJ': Morph.CONJ,
            'PRCL': Morph.PRCL,
            'INTJ': Morph.INTJ,
            'LATN': Morph.LATN,
        }
        return mapping.get(pos, Morph.OTHER)

    @staticmethod
    def index_of_any(source, dictionary):
        """Return True when every element of *source* is in *dictionary*.

        NOTE(review): the name suggests an index lookup but the behavior is
        an "all contained" test; kept for interface compatibility.
        """
        return all(item in dictionary for item in source)

    @staticmethod
    def intersects(source, dictionary):
        """Return True when at least one element of *source* is in *dictionary*."""
        return any(item in dictionary for item in source)

    def set_text(self, text):
        self.__text = text

    def set_tokens(self, tokens):
        self.__tokens = tokens

    def get_tokens(self):
        return self.__tokens

    def get_emails(self):
        return self.__emails

    def get_links(self):
        return self.__links

    def get_hash_tags(self):
        return self.__hash_tags

    def get_document(self):
        # Propagate the (possibly annotated) tokens back into the document.
        self.__doc.set_tokens(self.__tokens)

        return self.__doc