def testMathRemoval(self):
    markup1 = "If Z<sub>1</sub>, ..., ''Z''<sub>''k''</sub> are"
    expect1 = "If _inline_math_, ..., _inline_math_ are"
    markup2 = " (4 × 10<sup>12</sup> watts"
    expect2 = " (4 × _inline_math_ watts"
    self.assertEqual(unwiki.loads(markup1), expect1)
    self.assertEqual(unwiki.loads(markup2), expect2)
def testRefRemoval(self):
    markup1 = 'the best of a nation.<ref name="AdvisoryCommittee" /> In this way'
    expect1 = "the best of a nation. In this way"
    self.assertEqual(unwiki.loads(markup1), expect1)
    markup2 = """[[Jacques Le Goff]]<ref name="Le Goff">Le Goff, Jacques. ''La civilisation de l'Occident médieval''. Paris. 1964; English translation (1988): ''Medieval Civilization'', {{ISBN|0-631-17566-0}} – "translatio imperii" is discussed in Part II, Chapter VI, section on "Time, eternity and history".</ref> describes"""
    expect2 = """Jacques Le Goff describes"""
    self.assertEqual(unwiki.loads(markup2), expect2)
def testREFTagIsConsumedCorrectly(self):
    markup1 = "hi <ref I should not see this/> And I should see this <ref> this not</ref>"
    expect1 = "hi And I should see this "
    markup2 = "Now <ref>Remove This</ref> and forget <ref about this/>"
    expect2 = "Now and forget "
    self.assertEqual(unwiki.loads(markup1), expect1)
    self.assertEqual(unwiki.loads(markup2), expect2)
def testInfobox(self):
    self.assertEqual(unwiki.loads('{{Infobox none}} None'), ' None')
    self.assertEqual(unwiki.loads('{{foo bar}}'), '')
    self.assertEqual(unwiki.loads("""{{foo\nbar}}"""), '')
    self.assertEqual(unwiki.loads("""{{Infobox foo}} None"""), ' None')
def testHTMLspaces(self):
    markup1 = "Let ''X'' be a non-negative integer and ''n'' "
    expect1 = "Let X be a non-negative integer and n "
    self.assertEqual(unwiki.loads(markup1), expect1)
    markup2 = "this should be a <; and a >"
    expect2 = "this should be a <; and a >"
    self.assertEqual(unwiki.loads(markup2), expect2)
def testNestedCurlyBracketRemoval(self):
    markup1 = ''' Trying out {{the removal {{nested curly brackets}}}}'''
    expect1 = ' Trying out '
    markup2 = ''' Trying out {{the removal {{nested curly brackets}} this is looking pretty good }}'''
    expect2 = ' Trying out '
    markup3 = ''' Trying out If {{nowrap|log\u2009\'\'f\'\'(\'\'x\'\'; \'\'θ\'\')}} is {{nowrap| log θ the removal }}'''
    expect3 = ' Trying out If is '
    self.assertEqual(unwiki.loads(markup1), expect1)
    self.assertEqual(unwiki.loads(markup2), expect2)
    self.assertEqual(unwiki.loads(markup3), expect3)
def writeQuotes(content):
    global langArg
    global cutoffArg
    quoteList = []
    write = False
    i = 0
    while i < len(content):
        line = content[i]
        # Stop collecting once the next level-2 section heading is reached
        if line.startswith('==') and not line.startswith('==='):
            write = False
        if write and line.startswith('* ') and len(line) < (cutoffArg + 3):
            # would optimize, but since the program only needs to be run once, not really a priority
            cleaned_line = unwiki.loads(line) + '\n'
            cleaned_line = multireplace(cleaned_line, {
                "\\u2018": "'", "\\u2019": "'", "\\u2026": "...",
                "\\u2013": "-", "\\u2014": "-", "\\u201c": '"',
                "\\u201d": '"', "\\'": "'", "'''": "", "\n": ""})
            cleaned_line = re.sub(r"<.*>|'('+)|\\\\x..|\\u....", "", cleaned_line)
            cleaned_line = re.sub(r' +', ' ', cleaned_line)
            cleaned_line = cleaned_line[2:]
            if detect(cleaned_line) == langArg and "://" not in cleaned_line:
                quoteList.append(cleaned_line)
        if line == '==Quotes==' or line == '== Quotes ==':
            write = True
        i += 1
    return quoteList
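# Hedged usage sketch for writeQuotes, not part of the original script: the
# langArg/cutoffArg values and the 'quotes_page.wiki' file name below are
# hypothetical placeholders for however the surrounding program sets them up.
if __name__ == '__main__':
    langArg = 'en'      # assumed target language checked via detect()
    cutoffArg = 300     # assumed maximum quote length in characters
    with open('quotes_page.wiki', encoding='utf-8') as f:
        for quote in writeQuotes(f.read().splitlines()):
            print(quote)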
def testMath(self):
    markup1 = "the field {{math|'''R'''}} of real numbers"
    expect1 = "the field _inline_math_ of real numbers"
    self.assertEqual(unwiki.loads(markup1), expect1)
    markup2 = "the field {{ math |'''R'''}} of real numbers"
    expect2 = "the field _inline_math_ of real numbers"
    self.assertEqual(unwiki.loads(markup2), expect2)
    # Check the same for the mvar template
    markup1 = "the field {{mvar|'''R'''}} of real numbers"
    expect1 = "the field _inline_math_ of real numbers"
    self.assertEqual(unwiki.loads(markup1), expect1)
    markup2 = "the field {{ mvar |'''R'''}} of real numbers"
    expect2 = "the field _inline_math_ of real numbers"
    self.assertEqual(unwiki.loads(markup2), expect2)
    # math tags
    markup3 = r"with a [[norm (mathematics)|norm]] <math>\|\cdot\|_X</math>"
    expect3 = "with a norm _inline_math_"
    self.assertEqual(unwiki.loads(markup3), expect3)
def testFreeform(self):
    infobox = '''{{Infobox settlement
<!--See Template:Infobox settlement for additional fields that may be available-->
<!--See the Table at Infobox settlement for all fields and descriptions of usage-->
<!-- General information --------------->
|timezone = [[Eastern Time Zone|Eastern Standard Time]]
|utc_offset = -5
}}'''
    self.assertEqual(unwiki.loads(infobox), '')
    markup = """{{about|the borough in New York City}}\n'''Staten Island ''' {{IPAc-en|ˌ|s|t|æ|t|ən|_|ˈ|aɪ|l|ə|n|d}} is one of the five [[borough (New York City)|boroughs]] of [[New York City]], in the U.S. state of [[New York]]."""
    expect = "\nStaten Island is one of the five boroughs of New York City, in the U.S. state of New York."
    self.assertEqual(unwiki.loads(markup), expect)
    markup = """In the southwest of the city, Staten Island is the southernmost part of both the city and state of New York, with [[Conference House Park]] at the southern tip of the island and the state.<ref>{{cite web|website=http://www.nycgovparks.org/parks/conferencehousepark|title=Conference House Park|publisher=New York City Parks|accessdate=June 21, 2014}}</ref>"""
    expect = """In the southwest of the city, Staten Island is the southernmost part of both the city and state of New York, with Conference House Park at the southern tip of the island and the state."""
    self.assertEqual(unwiki.loads(markup), expect)
def unWikifyString(self, s):
    """ Removes Wiki formatting from a string. """
    unWikifiedString = unwiki.loads(s)
    wordList = unWikifiedString.split()
    i = 0
    while i < len(wordList):
        # Remove words containing a pipe character
        if wordList[i].find('|') > -1:
            del wordList[i]
        else:
            i += 1
    return ' '.join(wordList)
def endElement(self, name):
    if name == 'page':
        for _ in ['text', 'title']:
            self.page[_] = convhans(unwiki.loads(self.page[_].strip()))
        if self.tester(self.page):
            print(self.page['title'], self.page['id'])
            self.z.writestr(
                '{title}_{id}.txt'.format(**self.page),
                '''{title}\n===========\n\n{text}\nhttps://zh.wikipedia.org/wiki/{title}\n'''
                .format(**self.page))
    self.tags.pop()
def testLink(self):
    self.assertEqual(unwiki.loads('etc [[relative|link]] foo'), 'etc link foo')
    assert unwiki.loads('[[link]]') == 'link'
    self.assertEqual(unwiki.loads('[[relative link|link]]'), 'link')
    self.assertEqual(unwiki.loads('etc [[relative-link|link]] foo'), 'etc link foo')
    assert unwiki.loads('[[link (subject)|link]]') == 'link'
    assert unwiki.loads('[[Bar, Foo|Baz]], [[Foo]]') == 'Baz, Foo'
def main():
    with open("snippet.txt") as f:
        content = f.read()
    print("Original:")
    print(content)
    print("=" * 100)
    print("WikiClean output:")
    print(wikiclean.clean(content))
    try:
        import unwiki
        print("=" * 100)
        print("UnWiki output:")
        print(unwiki.loads(content))
    except ImportError:
        pass
    try:
        import dewiki
        import dewiki.parser as parser
        print("=" * 100)
        print("DeWiki output:")
        print(parser.Parser().parse_string(content))
    except ImportError:
        pass
def testCompressSpaces(self):
    self.assertEqual(
        unwiki.loads('removing this {{thing}} leaves extra spaces', True),
        'removing this leaves extra spaces')
if np.sum([use_stemmer, use_lemmatiser]) > 1:
    print('Choose only one option or none among: use_stemmer, use_lemmatiser')
    sys.exit()

if not os.path.exists(output_path):
    os.makedirs(output_path)

# If the occurrences matrix has already been created, skip this step
if not os.path.exists(output_path + 'features/ocurrences_matrix_cookbook.npy'):
    # Get all the scraped Cookbook files
    wiki_files = sorted(
        [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))])

    # Clean the corpus
    corpus = []
    for wiki_file in wiki_files:
        text = unwiki.loads(' '.join(open(path + wiki_file)))
        text = clean_text(text, stemmer, lemmatiser)
        with open(output_path + wiki_file, 'w') as f:
            f.write(text)
        corpus.extend(text.split('. '))

    # Compute the occurrences matrix
    features_output_path = output_path + 'features/'
    if not os.path.exists(features_output_path):
        os.makedirs(features_output_path)
    if not os.path.exists(features_output_path + 'ocurrences_matrix_cookbook.npy'):
        print('Computing occurrences')
        cv = CountVectorizer(ngram_range=(ngram_size, ngram_size))
def testBracketFilenames(self):
    markup = """[[image:050712_perm_3.png|thumb|upright=1.7|Diagram of a cyclic permutation with two fixed points; a 6-cycle and two 1-cycles. |190x190px]]
A [[permutation]] is called"""
    expect = "\nA permutation is called"
    self.assertEqual(unwiki.loads(markup), expect)
def testBlockRemoval(self):
    markup1 = "this is a \n<blockquote>\n macizo\nhello\n</blockquote>"
    expect1 = "this is a \n\n macizo\nhello\n"
    self.assertEqual(unwiki.loads(markup1), expect1)
def testList(self):
    lis = '* foo\n * bar\n ** [[baz]]'
    self.assertEqual(unwiki.loads(lis), "* foo\n * bar\n ** baz")
def testNestedFileBracketRemoval(self):
    markup1 = """[[File:LA-Triceratops mount-2.jpg|thumb|250px|left|''[[Triceratops]]'' skeleton, [[Natural History Museum of Los Angeles County]]]]
Under [[phylogenetic nomenclature]], dinosaurs"""
    expect1 = """\nUnder phylogenetic nomenclature, dinosaurs"""
    self.assertEqual(unwiki.loads(markup1), expect1)
def testHeadline(self):
    self.assertEqual(unwiki.loads('=== Head ==='), ' Head ')
    self.assertEqual(unwiki.loads('=== Head ===\nText'), ' Head \nText')
def testComment(self):
    assert unwiki.loads('<!-- comment -->foo') == 'foo'
def getLocationEmbeddinsFromWikipedia(processName, existingEmbeddinsIDs,
                                      startFromFileWithIndex, stopAtFile):
    print("Starting " + processName)
    n = 0
    createNewFile = True
    for line in smart_open(wikipediaDumpJSON):
        # Load the article into a dictionary
        article = json.loads(line.decode('utf-8'))
        if n < startFromFileWithIndex:
            n += 1
            continue
        if n > stopAtFile:
            locationEmbeddins.close()
            break
        if int(article['articleID']) in existingEmbeddinsIDs[int(article['articleID']) % hashSize]:
            # logger.info("[{}] Embeddings for article with title {} is already created".format(processName, article['title']))
            continue
        if createNewFile:
            locationEmbeddins = open(
                './outputs/locationEmbeddins_{}_{}.txt'.format(processName, str(n)), 'w')
            createNewFile = False
        logger.info("[{}]: Parsing article {}: {}".format(
            processName, str(n), article['title']))
        locationsInArticle = []
        for section_title, section_text in zip(article['section_titles'],
                                               article['section_texts']):
            # Remove wiki markup and HTML tags
            section_text = unwiki.loads(section_text, compress_spaces=True)
            section_text = re.sub(r'<.*?>', '', section_text)
            # Remove text between parentheses and other special characters
            section_text = re.sub("[($@*&?].*[$)@*&?]", "", section_text)
            # Tokenize into sentences
            senteces_in_section = sent_tokenize(section_text)
            # Perform named entity recognition at the sentence level
            for sentence in senteces_in_section:
                signal.signal(signal.SIGALRM, handler)
                signal.alarm(10)
                try:
                    listOfLocations = nerObj.getListOfLocationInSentece(sentence)
                except:
                    continue
                signal.alarm(0)
                locationsInArticle.extend(listOfLocations)
        n += 1
        locationEmbeddins.write("{}\t{}\t{}\n".format(
            article['articleID'], article['title'], ";".join(locationsInArticle)))
        # Start a new output file every 1000 iterations (in case the script crashes partway through)
        if n % 1000 == 0:
            locationEmbeddins.close()
            createNewFile = True
    print("Exiting " + processName)