class WikiMarkupParser(object):
    """
    Parser to remove all kinds of wiki markup tags from an object.

    ``parse_string`` is the public entry point; it delegates the cleaning to
    ``RegexHandler.clean_wikipedia_article`` and then drops parenthesised
    text.  ``parse_byte`` and ``parse_file`` are placeholders that currently
    do nothing.
    """

    # Leading run of "*" characters, i.e. the marker of a wiki list item.
    # Raw string: "\*" in a plain string literal is an invalid escape sequence.
    _LIST_PREFIX_RE = re.compile(r"^(\*+)")

    def __init__(self):
        """
        Constructor: compile the markup-stripping pattern and create the
        ``RegexHandler`` used for the actual article cleaning.
        """
        self.string = ""
        # All the following alternatives match tags that cannot be rendered
        # in text.  The pattern is compiled in verbose mode, so the blank
        # after each "|" is ignored by the regex engine.
        # NOTE(review): the "[\s\S]+" parts are greedy, so they run up to the
        # LAST closing delimiter in the string, and without re.M the "^"/"$"
        # anchors only apply to the whole string -- confirm this is intended.
        self.wiki_re = re.compile(
            r"\[{2}(File|Category):[\s\S]+\]{2}| "  # [[File:..]] / [[Category:..]]
            r"[\s\w#()]+\|| "  # text up to and including a "|"
            r"(\[{2}|\]{2})| "  # any remaining "[[" or "]]"
            r"\'{2,5}| "  # runs of 2 to 5 apostrophes
            r"(<s>|<!--)[\s\S]+(</s>|-->)| "  # <s>..</s> and <!-- .. -->
            r"{{[\s\S]+}}| "  # {{..}} blocks
            r"^={1,6}|={1,6}$",  # 1-6 "=" at the start or end of the string
            re.X,
        )
        self.regex_handler = RegexHandler()

    def __list(self, listmatch):
        """
        Build the plain-text bullet for a matched run of ``*``: one space per
        nesting level beyond the first, followed by a single ``*``.
        """
        return " " * (len(listmatch.group()) - 1) + "*"

    def __parse(self, string=""):
        """
        Parse a string to remove and replace all wiki markup tags.

        The markup matched by ``self.wiki_re`` is deleted, a leading list
        marker is rewritten by ``__list`` and the result is passed through
        ``RegexHandler.clean_wikipedia_article``.  The intermediate text and
        the list match stay available as ``self.string`` / ``self.listmatch``.
        """
        self.string = self.wiki_re.sub("", string)

        # search for lists; the pattern is anchored at the start, so cutting
        # at match.end() removes exactly the matched marker
        self.listmatch = self._LIST_PREFIX_RE.search(self.string)
        if self.listmatch:
            remainder = self.string[self.listmatch.end():]
            self.string = self.__list(self.listmatch) + remainder

        self.string = self.regex_handler.clean_wikipedia_article(self.string)
        return self.string

    def parse_string(self, string=""):
        """
        Clean wikipedia markup from ``string`` and return the result.

        The text is first cleaned by ``RegexHandler.clean_wikipedia_article``;
        afterwards every ``(...)`` span that does not contain a newline is
        removed, using the shortest possible match for each span.
        """
        result = self.regex_handler.clean_wikipedia_article(string)
        return re.sub(r"\(.*?\)", "", result)

    def parse_byte(self, byte=None):
        """Placeholder: not implemented, returns ``None``."""
        pass

    def parse_file(self, file=None):
        """Placeholder: not implemented, returns ``None``."""
        pass
class WikiMarkupParser(object):
    '''
    Parser to remove all kinds of wiki markup tags from an object.

    Only ``parse_string`` does real work: it runs the text through
    ``RegexHandler.clean_wikipedia_article`` and then removes parenthesised
    spans.  ``parse_byte`` and ``parse_file`` are empty stubs.
    '''

    # Marker of a wiki list item: one or more "*" at the very start of the
    # string.  Written as a raw string because "\*" is not a valid escape
    # sequence in an ordinary string literal.
    _LIST_MARKER = re.compile(r'^(\*+)')

    def __init__(self):
        '''
        Constructor: prepares the markup regex and the RegexHandler instance.
        '''
        self.string = ''
        # all the following regex remove all tags that cannot be rendered
        # in text: [[File:..]]/[[Category:..]] links, "text|" prefixes,
        # leftover "[[" / "]]", 2-5 apostrophes, <s>..</s> and <!--..-->
        # spans, {{..}} blocks and 1-6 "=" at the start/end of the string.
        # Verbose mode (re.X) makes the blanks between alternatives
        # insignificant.
        self.wiki_re = re.compile(
            r"\[{2}(File|Category):[\s\S]+\]{2}| "
            r"[\s\w#()]+\|| "
            r"(\[{2}|\]{2})| "
            r"\'{2,5}| "
            r"(<s>|<!--)[\s\S]+(</s>|-->)| "
            r"{{[\s\S]+}}| "
            r"^={1,6}|={1,6}$",
            re.X)
        self.regex_handler = RegexHandler()

    def __list(self, listmatch):
        '''
        Turn a matched run of "*" into an indented bullet: (length - 1)
        spaces followed by one "*".
        '''
        return ' ' * (len(listmatch.group()) - 1) + '*'

    def __parse(self, string=''):
        '''
        Parse a string to remove and replace all wiki markup tags.

        Returns the cleaned text; it is also kept in ``self.string`` and the
        list-marker match (or None) in ``self.listmatch``.
        '''
        self.string = self.wiki_re.sub('', string)

        # search for lists
        self.listmatch = self._LIST_MARKER.search(self.string)
        if self.listmatch:
            # the pattern is anchored with "^", so at most the leading marker
            # is rewritten; __list receives the match object from sub()
            self.string = self._LIST_MARKER.sub(self.__list, self.string,
                                                count=1)

        self.string = self.regex_handler.clean_wikipedia_article(self.string)
        return self.string

    def parse_string(self, string=''):
        '''
        Clean wikipedia markup using the RegexHandler clean method, then
        strip every "(...)" span that fits on one line (shortest match).
        '''
        result = self.regex_handler.clean_wikipedia_article(string)
        result = re.sub(r'\(.*?\)', '', result)
        return result

    def parse_byte(self, byte=None):
        ''' Stub: not implemented, returns None. '''
        pass

    def parse_file(self, file=None):
        ''' Stub: not implemented, returns None. '''
        pass
def getNegativeSamples(self, indices):
    """Write the cleaned negative-sample lines to the ``neg_samples`` file.

    For every line number in ``indices`` the line is read from
    ``self.negativeSource`` with ``linecache.getline``, cleaned with
    ``RegexHandler.get_clean_line_for_sampling`` and written to
    ``neg_samples`` followed by a newline.

    Args:
        indices: iterable of line numbers to fetch from
            ``self.negativeSource``.

    Returns:
        None. The result is the ``neg_samples`` file in the current working
        directory; any previous content of that file is discarded.
    """
    r = RegexHandler()
    # Opening once in "w" mode truncates the old file (so an empty ``indices``
    # still leaves an empty file behind) and the ``with`` block guarantees the
    # handle is closed even if cleaning or writing a line raises.
    with open("neg_samples", "w") as negative_sampling:
        for index in indices:
            line = linecache.getline(self.negativeSource, index)
            line = r.get_clean_line_for_sampling(line)
            negative_sampling.write(line + "\n")
def getNegativeSamples(self, indices):
    """Dump cleaned lines of the negative source into ``neg_samples``.

    Each entry of ``indices`` is used as a line number for
    ``linecache.getline`` on ``self.negativeSource``; the fetched line is
    passed through ``RegexHandler.get_clean_line_for_sampling`` and written,
    newline-terminated, to the file ``neg_samples`` (which is overwritten).

    Args:
        indices: iterable of line numbers to sample.

    Returns:
        None.
    """
    cleaner = RegexHandler()
    source = self.negativeSource
    # One handle for the whole run instead of an open/close per line; the
    # context manager closes it on every exit path.  "w" recreates the file,
    # so it ends up empty when ``indices`` yields nothing.
    with open("neg_samples", "w") as negative_sampling:
        negative_sampling.writelines(
            cleaner.get_clean_line_for_sampling(
                linecache.getline(source, index)) + "\n"
            for index in indices
        )
def __init__(self):
    '''
    Constructor.

    Starts with an empty working string, compiles the pattern used to strip
    wiki markup and creates the RegexHandler instance.
    '''
    self.string = ''

    # One verbose-mode pattern whose alternatives cover the tags that cannot
    # be rendered in text; blanks between the alternatives are ignored by
    # the regex engine in this mode.
    markup_flags = re.VERBOSE
    self.wiki_re = re.compile(
        r"""\[{2}(File|Category):[\s\S]+\]{2}| [\s\w#()]+\|| (\[{2}|\]{2})| \'{2,5}| (<s>|<!--)[\s\S]+(</s>|-->)| {{[\s\S]+}}| ^={1,6}|={1,6}$""",
        markup_flags)

    self.regex_handler = RegexHandler()
def extract_definitions(self, raw, word_type):
    """Collect the cleaned definition lines that follow ``word_type``.

    The text between the first occurrence of ``word_type`` and the next
    occurrence of ``self.synonym_pattern`` after it is split into lines.
    Lines starting with ``"# "`` or ``"## "`` are treated as definitions:
    the marker is removed, the rest is cleaned with
    ``RegexHandler.wiktionary_markup_cleanup`` and stripped, and non-empty
    results are collected in order.

    Args:
        raw: text to search (presumably raw wiki markup of an entry --
            confirm against the caller).
        word_type: marker string after which the definitions start.

    Returns:
        list of cleaned definition strings; empty when ``word_type`` does
        not occur in ``raw`` or no definition line is found.
    """
    marker_at = raw.find(word_type)
    if marker_at == -1:
        # str.find signals "not found" with -1; without this guard the slice
        # below would start at an unrelated offset.
        return []
    start_index = marker_at + len(word_type)

    # Look for the end marker only after the start, so an earlier occurrence
    # cannot produce an empty slice.  When it is missing, read to the end of
    # the text instead of using -1, which would drop the last character.
    end_index = raw.find(self.synonym_pattern, start_index)
    if end_index == -1:
        end_index = len(raw)

    # RegexHandler performs the markup cleanup of each definition
    _wiki_parser = RegexHandler()
    result_list = []
    for item in raw[start_index:end_index].split("\n"):
        # A line can match at most one of the two markers.
        for prefix in ("# ", "## "):
            if item.startswith(prefix):
                clean_def = _wiki_parser.wiktionary_markup_cleanup(
                    item[len(prefix):]
                ).strip()
                if clean_def != "":
                    result_list.append(clean_def)
                break
    return result_list
def __init__(self):
    """
    Constructor.

    Sets the working string to empty, compiles ``wiki_re`` (the pattern of
    markup that cannot be rendered in text) and instantiates the
    ``RegexHandler`` helper.
    """
    self.string = ""

    # Alternatives of the verbose-mode pattern; whitespace between them is
    # not significant to the regex engine.
    unrenderable_markup = r"""\[{2}(File|Category):[\s\S]+\]{2}| [\s\w#()]+\|| (\[{2}|\]{2})| \'{2,5}| (<s>|<!--)[\s\S]+(</s>|-->)| {{[\s\S]+}}| ^={1,6}|={1,6}$"""
    self.wiki_re = re.compile(unrenderable_markup, flags=re.VERBOSE)

    self.regex_handler = RegexHandler()
def transform(self, text):
    """Return ``text`` as processed by ``RegexHandler.get_alphanumeric``.

    A new ``RegexHandler`` is created on every call.  The intent stated by
    the original author is to obtain a string that is printable on the
    console.
    """
    return RegexHandler().get_alphanumeric(text)
def transform(self, text):
    """Return ``text`` as processed by ``RegexHandler.give_printable_string``.

    The handler is instantiated per call; the stated purpose is to get a
    string that can be printed on the console.
    """
    handler = RegexHandler()
    return handler.give_printable_string(text)