class WikiMarkupParser(object):
    """
    Parser to remove all kinds of wiki markup tags from an object

    ``parse_string`` is the working entry point; it delegates to the
    ``RegexHandler`` helper and then drops parenthesised text.
    ``parse_byte`` and ``parse_file`` are placeholders that do nothing.
    """

    # Run of leading asterisks (wiki list marker) at the start of the string.
    # Raw string: "\*" in a plain literal is an invalid escape sequence.
    _LIST_RE = re.compile(r"^(\*+)")

    def __init__(self):
        """
        Constructor

        Initialises the working string, compiles the markup-stripping regex
        and creates the ``RegexHandler`` used for the final clean-up.
        """
        self.string = ""
        # all the following regex remove all tags that cannot be rendered
        # in text. Alternatives, in order:
        #   [[File:...]] / [[Category:...]] spans
        #   a run of whitespace / word chars / "#" / "(" / ")" ending in "|"
        #   bare "[[" or "]]"
        #   2 to 5 consecutive apostrophes
        #   <s>...</s> and <!--...--> spans
        #   {{...}} spans
        #   1 to 6 "=" at the start or at the end of the string
        # NOTE: the [\s\S]+ spans are greedy, and without re.M the "^" / "$"
        # anchors only apply to the string as a whole, not to each line.
        self.wiki_re = re.compile(
            r"""\[{2}(File|Category):[\s\S]+\]{2}|
                                        [\s\w#()]+\||
                                        (\[{2}|\]{2})|
                                        \'{2,5}|
                                        (<s>|<!--)[\s\S]+(</s>|-->)|
                                        {{[\s\S]+}}|
                                        ^={1,6}|={1,6}$""",
            re.X,
        )

        self.regex_handler = RegexHandler()

    def __list(self, listmatch):
        """
        Build the plain-text bullet for a matched run of list asterisks:
        one space per asterisk beyond the first, followed by a single "*".
        """
        return " " * (len(listmatch.group()) - 1) + "*"

    def __parse(self, string=""):
        """
        Parse a string to remove and replace all wiki markup tags

        Strips everything matched by ``self.wiki_re``, rewrites a leading
        list marker into an indented bullet, then passes the result through
        ``RegexHandler.clean_wikipedia_article``. Returns the cleaned string
        (also kept in ``self.string``). Not called by ``parse_string``.
        """
        self.string = self.wiki_re.sub("", string)
        # search for lists
        self.listmatch = self._LIST_RE.search(self.string)

        if self.listmatch:
            # The pattern is anchored at the start, so cutting at the end of
            # the match removes exactly the asterisks that were matched.
            remainder = self.string[self.listmatch.end():]
            self.string = self.__list(self.listmatch) + remainder
        self.string = self.regex_handler.clean_wikipedia_article(self.string)

        return self.string

    # method to clean wikipedia markup using wiki_extractor clean method
    def parse_string(self, string=""):
        """
        Clean ``string`` with ``RegexHandler.clean_wikipedia_article`` and
        then remove every "(...)" group (shortest match, single line only
        because "." does not match a newline here). Returns the result.
        """
        result = self.regex_handler.clean_wikipedia_article(string)  # wiki_extractor
        result = re.sub(r"\(.*?\)", "", result)  # plain regex
        return result

    def parse_byte(self, byte=None):
        """Placeholder: not implemented, does nothing and returns None."""
        pass

    def parse_file(self, file=None):
        """Placeholder: not implemented, does nothing and returns None."""
        pass
Example #2
0
class WikiMarkupParser(object):
    '''
    Parser to remove all kinds of wiki markup tags from an object

    Only ``parse_string`` does real work; ``parse_byte`` and ``parse_file``
    are empty placeholders.
    '''

    # Leading wiki list marker: one or more "*" at the very start of the
    # string. Written as a raw string because '\*' in an ordinary literal is
    # an invalid escape sequence.
    _LIST_RE = re.compile(r'^(\*+)')

    def __init__(self):
        '''
        Constructor

        Sets the working string to empty, compiles the regex that strips
        non-renderable markup and instantiates the ``RegexHandler`` helper.
        '''
        self.string = ''
        # all the following regex remove all tags that cannot be rendered
        # in text: [[File:..]] / [[Category:..]] spans, a "target|" prefix
        # made of whitespace, word characters, "#" and parentheses, stray
        # "[[" / "]]", runs of 2-5 apostrophes, <s>..</s> and <!--..-->
        # spans, {{..}} spans, and 1-6 "=" at either end of the string.
        # NOTE: the [\s\S]+ parts are greedy and no re.M flag is set.
        self.wiki_re = re.compile(
            r"""\[{2}(File|Category):[\s\S]+\]{2}|
                                        [\s\w#()]+\||
                                        (\[{2}|\]{2})|
                                        \'{2,5}|
                                        (<s>|<!--)[\s\S]+(</s>|-->)|
                                        {{[\s\S]+}}|
                                        ^={1,6}|={1,6}$""", re.X)

        self.regex_handler = RegexHandler()

    def __list(self, listmatch):
        '''
        Return the bullet text for a matched asterisk run: (n - 1) spaces
        followed by "*", where n is the length of the match.
        '''
        return ' ' * (len(listmatch.group()) - 1) + '*'

    def __parse(self, string=''):
        '''
        Parse a string to remove and replace all wiki markup tags

        Applies ``self.wiki_re``, converts a leading asterisk run into an
        indented bullet and finally calls
        ``RegexHandler.clean_wikipedia_article``. The cleaned text is stored
        in ``self.string`` and returned. ``parse_string`` does not use this.
        '''
        self.string = self.wiki_re.sub('', string)
        # search for lists
        self.listmatch = self._LIST_RE.search(self.string)

        if self.listmatch:
            # The match always starts at index 0, so slicing from its end
            # drops the asterisks and nothing else.
            rest = self.string[self.listmatch.end():]
            self.string = self.__list(self.listmatch) + rest
        self.string = self.regex_handler.clean_wikipedia_article(self.string)

        return self.string

    # method to clean wikipedia markup using wiki_extractor clean method
    def parse_string(self, string=''):
        '''
        Run ``string`` through ``RegexHandler.clean_wikipedia_article`` and
        then delete each "(...)" group using a non-greedy match that does
        not cross newlines. Returns the cleaned text.
        '''
        result = self.regex_handler.clean_wikipedia_article(
            string)  # wiki_extractor
        result = re.sub(r'\(.*?\)', '', result)  # plain regex
        return result

    def parse_byte(self, byte=None):
        '''Placeholder: not implemented, does nothing and returns None.'''
        pass

    def parse_file(self, file=None):
        '''Placeholder: not implemented, does nothing and returns None.'''
        pass
Example #3
0
 def getNegativeSamples(self, indices):
     """
     Write one cleaned line per entry of ``indices`` to the file
     "neg_samples" in the current working directory.

     Each index is used as a line number into ``self.negativeSource`` via
     ``linecache.getline``; the line is passed through
     ``RegexHandler.get_clean_line_for_sampling`` and written followed by a
     newline. Any previous content of "neg_samples" is discarded. Returns
     None.
     """
     r = RegexHandler()
     # Open once in "w" mode: this truncates the old file, and the context
     # manager closes the handle even if a lookup or clean-up step raises.
     with open("neg_samples", "w") as negative_sampling:
         for index in indices:
             line = linecache.getline(self.negativeSource, index)
             line = r.get_clean_line_for_sampling(line)
             negative_sampling.write(line + "\n")
 def getNegativeSamples(self, indices):
     """
     Dump the cleaned negative-sample lines selected by ``indices`` into the
     file "neg_samples", replacing whatever the file held before.

     For every index, the line with that number is read from
     ``self.negativeSource`` with ``linecache.getline``, cleaned by
     ``RegexHandler.get_clean_line_for_sampling`` and written with a
     trailing newline. Returns None.
     """
     r = RegexHandler()
     # A single "w"-mode handle both empties the file and receives all
     # lines; ``with`` guarantees it is closed on any exit path.
     with open("neg_samples", "w") as negative_sampling:
         for index in indices:
             line = linecache.getline(self.negativeSource, index)
             line = r.get_clean_line_for_sampling(line)
             negative_sampling.write(line + "\n")
Example #5
0
    def __init__(self):
        '''
        Set up the parser state: an empty working string, the compiled
        markup-stripping regex and a ``RegexHandler`` instance.
        '''
        # One alternation covering the markup that cannot be rendered in
        # text. It is compiled in verbose mode, so the line breaks and
        # indentation inside the literal are not part of the pattern.
        markup_pattern = r"""\[{2}(File|Category):[\s\S]+\]{2}|
                                        [\s\w#()]+\||
                                        (\[{2}|\]{2})|
                                        \'{2,5}|
                                        (<s>|<!--)[\s\S]+(</s>|-->)|
                                        {{[\s\S]+}}|
                                        ^={1,6}|={1,6}$"""

        self.string = ''
        self.wiki_re = re.compile(markup_pattern, flags=re.VERBOSE)
        self.regex_handler = RegexHandler()
    def extract_definitions(self, raw, word_type):
        """
        Collect the cleaned definition lines that follow ``word_type``.

        The section examined starts right after the first occurrence of
        ``word_type`` in ``raw`` and ends at the next occurrence of
        ``self.synonym_pattern`` (or at the end of ``raw`` when that marker
        does not occur). Within it, every line starting with "# " or "## "
        is stripped of that prefix, passed through
        ``RegexHandler.wiktionary_markup_cleanup`` and whitespace-stripped.

        Returns the non-empty results as a list, in order of appearance;
        an empty list when ``word_type`` is not present in ``raw``.
        """
        type_index = raw.find(word_type)
        if type_index == -1:
            # str.find signals "absent" with -1; using it as an offset would
            # slice from an unrelated position, so there is nothing to do.
            return []
        start_index = type_index + len(word_type)

        # Look for the end marker only after the section start, so an earlier
        # occurrence cannot produce an empty or inverted slice.
        end_index = raw.find(self.synonym_pattern, start_index)
        if end_index == -1:
            # No end marker: take the rest of the text instead of letting
            # -1 silently cut off the final character.
            end_index = len(raw)

        # get instance of wikimarkup parser to clean
        _wiki_parser = RegexHandler()

        result_list = []
        for item in raw[start_index:end_index].split("\n"):
            if item.startswith("## "):
                definition = item[3:]
            elif item.startswith("# "):
                definition = item[2:]
            else:
                continue
            clean_def = _wiki_parser.wiktionary_markup_cleanup(definition).strip()
            if clean_def:
                result_list.append(clean_def)
        return result_list
    def __init__(self):
        """
        Create the parser: start with an empty working string, compile the
        regex that strips non-renderable wiki markup and build the
        ``RegexHandler`` helper.
        """
        # Verbose-mode alternation; whitespace inside the literal is layout
        # only and is ignored by the regex engine.
        unrenderable_markup = r"""\[{2}(File|Category):[\s\S]+\]{2}|
                                        [\s\w#()]+\||
                                        (\[{2}|\]{2})|
                                        \'{2,5}|
                                        (<s>|<!--)[\s\S]+(</s>|-->)|
                                        {{[\s\S]+}}|
                                        ^={1,6}|={1,6}$"""

        self.string = ""
        self.wiki_re = re.compile(unrenderable_markup, flags=re.VERBOSE)
        self.regex_handler = RegexHandler()
 def transform(self, text):
     """
     Return ``text`` as processed by ``RegexHandler.get_alphanumeric``.
     """
     # the aim is a string that can be printed on the console
     return RegexHandler().get_alphanumeric(text)
Example #9
0
 def transform(self, text):
     """
     Pass ``text`` through ``RegexHandler.get_alphanumeric`` and return
     whatever that helper produces.
     """
     # goal: obtain a console-printable version of the string
     handler = RegexHandler()
     return handler.get_alphanumeric(text)
Example #10
0
 def transform(self, text):
     """
     Return ``text`` as processed by ``RegexHandler.give_printable_string``.
     """
     # the aim is a string that can be printed on the console
     return RegexHandler().give_printable_string(text)