Example #1

import os
import re
from collections import OrderedDict

from textblob import TextBlob

# NOTE: Token, Grapheme, Label and Document are project-local classes; the
# module paths below are assumptions and may differ in the actual project.
from token_model import Token, Grapheme, Label
from document import Document

class GraphematicalAnalysis:
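    """Graphematical analysis of raw text.

    Splits the text into tokens and labels each token as a delimiter, a
    Cyrillic or Latin lexeme, a number, punctuation, a quote, an e-mail,
    a hashtag, a link, or another composite token.
    """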
    def __init__(self, text=None):
        self.__text = text

        self.__blob = None

        self.__tokens = []

        self.__DEL = [' ', '  ', '    ', '\t', '\n']
        self.__SIG = [
            '.', ',', '-', '—', '!', '?', ';', ':', '(', ')', '[', ']', '{',
            '}'
        ]
        self.__SYM = ['«', '»', '"', '“', '”', '``', "''"]
        self.__RLE = [
            'й', 'ц', 'у', 'к', 'е', 'н', 'г', 'ш', 'щ', 'з', 'х', 'ъ', 'ф',
            'ы', 'в', 'а', 'п', 'р', 'о', 'л', 'д', 'ж', 'э', 'я', 'ч', 'с',
            'м', 'и', 'т', 'ь', 'б', 'ю', 'ё', 'Й', 'Ц', 'У', 'К', 'Е', 'Н',
            'Г', 'Ш', 'Щ', 'З', 'Х', 'Ъ', 'Ф', 'Ы', 'В', 'А', 'П', 'Р', 'О',
            'Л', 'Д', 'Ж', 'Э', 'Я', 'Ч', 'С', 'М', 'И', 'Т', 'Ь', 'Б', 'Ю',
            'Ё'
        ]

        self.__LLE = [
            'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', 'a', 's', 'd',
            'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm',
            'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', 'A', 'S', 'D',
            'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M'
        ]

        self.__DC = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
        self.__END = ['.', '!', '?']

        self.__emails = []
        self.__hash_tags = []
        self.__links = []

        self.__tokenization_result = []

        self.__doc = None

        self.__regexp_dir = os.getcwd() + '/common/'

    def __pre_processing(self):
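        """Tokenize the text with TextBlob after masking e-mails, hashtags and
        links, and normalize quote tokens in the tokenization result."""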

        # Clear all previous results

        self.__tokens = []
        self.__tokenization_result = []

        # Extract all emails & replace them with '__EMAIL'
        self.__emails = self.extract_email_addresses(self.__text)
        for email in self.__emails:
            self.__text = self.__text.replace(email, '__EMAIL')

        # Extract all hashtags & replace them with '__HASHTAG'
        self.__hash_tags = self.extract_hash_tags(self.__text)
        for tag in self.__hash_tags:
            self.__text = self.__text.replace(tag, '__HASHTAG')

        # Extract all links & replace them with '__LINK'
        self.__links = self.extract_links(self.__text)
        for link in self.__links:
            self.__text = self.__text.replace(link, '__LINK')

        self.__blob = TextBlob(self.__text)

        # Normalize TextBlob quote tokens ('``' and "''") to '"'
        for token in self.__blob.tokens:
            new_token = str(token)
            if new_token.startswith('``'):
                new_token = "\""
            elif new_token.endswith('\'\''):
                new_token = "\""
            self.__tokenization_result.append(new_token)

        # Removing unicode special character in first token
        # self.__tokenization_result[0] = self.__tokenization_result[0][1:]

        quotes = OrderedDict()

        # Search for quotes, delete them & remember their positions
        for i in range(0, len(self.__tokenization_result)):
            if self.__tokenization_result[i].startswith(
                    "«") or self.__tokenization_result[i].startswith("\""):

                if len(self.__tokenization_result[i]) > 2:
                    # Remember first symbol
                    quotes[i + len(quotes)] = self.__tokenization_result[i][0]

                    # Delete first symbol
                    self.__tokenization_result[i] = self.__tokenization_result[
                        i][1:]

            if self.__tokenization_result[i].endswith(
                    "»") or self.__tokenization_result[i].endswith("\""):

                if len(self.__tokenization_result[i]) > 2:
                    # Remember last symbol
                    quotes[i + len(quotes) +
                           1] = self.__tokenization_result[i][
                               len(self.__tokenization_result[i]) - 1]

                    # Delete last symbol
                    self.__tokenization_result[i] = self.__tokenization_result[
                        i][0:-1]

        # Insert the remembered quotes back as individual tokens
        for key in quotes.keys():
            self.__tokenization_result.insert(key, quotes[key])

    def analysis(self):
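        """Run pre-processing and assign a grapheme class and labels to every
        token; return the resulting list of Token objects."""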

        self.__pre_processing()

        current_email = 0
        current_hash_tag = 0
        current_link = 0

        # Add descriptors & labels for each token in the text
        for raw_token in self.__tokenization_result:

            if raw_token == '__EMAIL':
                raw_token = self.__emails[current_email]
                current_email += 1
            elif raw_token == '__HASHTAG':
                raw_token = self.__hash_tags[current_hash_tag]
                current_hash_tag += 1
            elif raw_token == '__LINK':
                raw_token = self.__links[current_link]
                current_link += 1

            if self.index_of_any(raw_token, self.__DEL):
                # Delimiter
                token = Token(text=raw_token, grapheme=Grapheme.DEL)

                # labels
                token.add_label(Label.SPACE)

                self.__tokens.append(token)

            elif self.index_of_any(raw_token, self.__RLE):
                # Russian lexeme
                token = Token(text=raw_token, grapheme=Grapheme.RLE)

                # labels
                token.add_label(Label.WORD)
                token.add_label(Label.CYRIL)

                self.__tokens.append(token)

            elif self.index_of_any(raw_token, self.__SYM):
                # Symbol
                token = Token(text=raw_token, grapheme=Grapheme.SYM)

                # labels
                token.add_label(Label.QUOTE)
                token.add_label(Label.MARKUP)

                if raw_token == "«" or raw_token == "\"":
                    token.add_label(Label.OPENING)
                elif raw_token == "»" or raw_token == "\"":
                    token.add_label(Label.CLOSING)

                self.__tokens.append(token)

            elif self.index_of_any(raw_token, self.__LLE):
                # Latin lexeme
                token = Token(text=raw_token, grapheme=Grapheme.LLE)

                # labels
                token.add_label(Label.WORD)
                token.add_label(Label.LATIN)

                self.__tokens.append(token)

            elif self.index_of_any(raw_token, self.__DC):
                # Digits complex
                token = Token(text=raw_token, grapheme=Grapheme.DC)

                # labels
                token.add_label(Label.NUMBER)

                self.__tokens.append(token)

            else:
                if self.index_of_any(raw_token, self.__SIG):
                    # Signum
                    token = Token(text=raw_token, grapheme=Grapheme.SIG)

                    # labels
                    token.add_label(Label.PUNCT)

                    if raw_token == "(" or raw_token == "[" or raw_token == '{':
                        token.add_label(Label.OPENING)
                    elif raw_token == ")" or raw_token == "]" or raw_token == '}':
                        token.add_label(Label.CLOSING)

                    self.__tokens.append(token)
                else:
                    # Composite token TODO: add phone label
                    token = Token(text=raw_token, grapheme=Grapheme.COMPOSITE)

                    # labels
                    if raw_token in self.__emails:
                        token.add_label(Label.EMAIL)
                    elif raw_token in self.__hash_tags:
                        token.add_label(Label.HASHTAG)
                    elif raw_token in self.__links:
                        token.add_label(Label.LINK)
                    elif self.is_word_with_a_hyphen(raw_token):
                        token.add_label(Label.WORD)
                        token.add_label(Label.CYRIL)
                    else:
                        token.add_label(Label.OTHER)

                    self.__tokens.append(token)

            space_token = Token(text=" ", grapheme=Grapheme.DEL)
            space_token.add_label(Label.SPACE)

            # self.__tokens.append(space_token)

        return self.__tokens

    @staticmethod
    def index_of_any(source, dictionary):
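        """Return True if every character of `source` is in `dictionary`
        (an 'all characters' check, despite the name)."""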
        for i in range(0, len(source)):
            if source[i] not in dictionary:
                return False
        return True

    @staticmethod
    def intersects(source, dictionary):
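        """Return True if at least one character of `source` is in `dictionary`."""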
        for i in range(0, len(source)):
            if source[i] in dictionary:
                return True
        return False

    @staticmethod
    def extract_email_addresses(string):
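        """Return all e-mail-like substrings found in `string`."""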

        r = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-z]{2,5})")
        return r.findall(string)

    @staticmethod
    def is_word_with_a_hyphen(string):
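        """Return True if `string` contains a hyphenated word."""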

        # Require at least one hyphen between word characters, e.g. 'кто-нибудь'
        return re.search(r'\w+-\w+', string) is not None

    def extract_links(self, string):
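        """Return all substrings matching the link pattern stored in
        common/link_regexp.txt."""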

        with open(self.__regexp_dir + 'link_regexp.txt', 'r') as f:
            pattern = f.read()

        r = re.compile(pattern)
        return r.findall(string)

    @staticmethod
    def extract_hash_tags(string):
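        """Return all '#'-prefixed hashtags found in `string`."""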
        r = re.compile(r'#\w+')
        return r.findall(string)

    def set_text(self, text):
        self.__text = text

    def get_tokens(self):
        # self.__tokens.pop()
        return self.__tokens

    def get_emails(self):
        return self.__emails

    def get_links(self):
        return self.__links

    def get_hash_tags(self):
        return self.__hash_tags

    def get_document(self):

        self.__doc = Document(text=self.__text)

        self.__doc.set_emails(self.__emails)
        self.__doc.set_links(self.__links)
        self.__doc.set_hash_tags(self.__hash_tags)

        self.__doc.set_tokens(self.__tokens)

        self.__doc.set_sentences(list(self.__blob.sentences))

        return self.__doc
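

# Usage sketch (not from the original listing): assumes the project-local
# Token, Grapheme, Label and Document classes are importable and that
# common/link_regexp.txt exists under the current working directory.
if __name__ == '__main__':
    analyzer = GraphematicalAnalysis(
        text='Пришлите отчёт на report@example.com до пятницы! #срочно')
    for token in analyzer.analysis():
        print(token)
    doc = analyzer.get_document()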