Example #1
 def testMathRemoval(self):
     markup1 = "If Z<sub>1</sub>, ..., ''Z''<sub>''k''</sub> are"
     expect1 = "If _inline_math_, ..., _inline_math_ are"
     markup2 = " (4 × 10<sup>12</sup> watts"
     expect2 = " (4 × _inline_math_ watts"
     self.assertEqual(unwiki.loads(markup1), expect1)
     self.assertEqual(unwiki.loads(markup2), expect2)
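Most of the snippets on this page are unittest methods lifted out of their original test classes. A minimal scaffold for running one of them directly might look like the sketch below; the class name UnwikiLoadsTests is illustrative rather than taken from any of the source projects, and it assumes the unwiki package is installed and behaves as the examples show.

import unittest

import unwiki


class UnwikiLoadsTests(unittest.TestCase):

    def testMathRemoval(self):
        markup = "If Z<sub>1</sub>, ..., ''Z''<sub>''k''</sub> are"
        self.assertEqual(unwiki.loads(markup), "If _inline_math_, ..., _inline_math_ are")


if __name__ == '__main__':
    unittest.main()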
Example #2
 def testRefRemoval(self):
     markup1 = 'the best of a nation.<ref name="AdvisoryCommittee" />  In this way'
     expect1 = "the best of a nation.  In this way"
     self.assertEqual(unwiki.loads(markup1), expect1)
     markup2 = """[[Jacques Le Goff]]<ref name="Le Goff">Le Goff, Jacques. ''La civilisation de l'Occident médieval''. Paris. 1964; English translation (1988): ''Medieval Civilization'', {{ISBN|0-631-17566-0}} &ndash; "translatio imperii" is discussed in Part II, Chapter VI, section on "Time, eternity and history".</ref> describes"""
     expect2 = """Jacques Le Goff describes"""
     self.assertEqual(unwiki.loads(markup2), expect2)
Example #3
 def testREFTagIsConsumedCorrectly(self):
     markup1 = "hi <ref I should not see this/> And I should see this <ref> this not</ref>"
     expect1 = "hi  And I should see this "
     markup2 = "Now <ref>Remove This</ref> and forget <ref about this/>"
     expect2 = "Now  and forget "
     self.assertEqual(unwiki.loads(markup1), expect1)
     self.assertEqual(unwiki.loads(markup2), expect2)
Example #4
    def testInfobox(self):
        self.assertEqual(unwiki.loads('{{Infobox none}} None'), ' None')
        self.assertEqual(unwiki.loads('{{foo bar}}'), '')
        self.assertEqual(unwiki.loads("""{{foo\nbar}}"""), '')

        self.assertEqual(unwiki.loads("""{{Infobox
            foo}} None"""), ' None')
Example #5
 def testHTMLspaces(self):
     markup1 = "Let  &nbsp;''X''&nbsp;  be a non-negative integer and &nbsp;''n''&nbsp;"
     expect1 = "Let   X   be a non-negative integer and  n "
     self.assertEqual(unwiki.loads(markup1), expect1)
     markup2 = "this should be a &lt;; and a &gt;"
     expect2 = "this should be a <; and a >"
     self.assertEqual(unwiki.loads(markup2), expect2)
Example #6
 def testNestedCurlyBracketRemoval(self):
     markup1 = ''' Trying out {{the removal {{nested curly brackets}}}}'''
     expect1 = ' Trying out '
     markup2 = ''' Trying out {{the removal {{nested curly brackets}} this is looking pretty good }}'''
     expect2 = ' Trying out '
     markup3 = ''' Trying out If {{nowrap|log\u2009\'\'f\'\'(\'\'x\'\'; \'\'θ\'\')}} is {{nowrap| log θ the removal }}'''
     expect3 = ' Trying out If  is '
     self.assertEqual(unwiki.loads(markup1), expect1)
     self.assertEqual(unwiki.loads(markup2), expect2)
     self.assertEqual(unwiki.loads(markup3), expect3)
Example #7
def writeQuotes(content):
	global langArg
	global cutoffArg

	quoteList = []
	write = False
	i = 0

	while i < len(content):
		line = content[i]

		# A new level-2 section heading ends the current Quotes block
		if line.startswith('==') and not line.startswith('==='):
			write = False
		if write and line.startswith('* ') and len(line) < (cutoffArg + 3):

			# Could be optimized, but the script only needs to run once, so it is not a priority
			cleaned_line = unwiki.loads(line) + '\n'
			cleaned_line = multireplace(cleaned_line, {"\\u2018": "'", "\\u2019": "'", "\\u2026": "...", "\\u2013": "-", "\\u2014": "-", "\\u201c": '"', "\\u201d": '"', "\\'": "'", "'''": "", "\n": ""})
			cleaned_line = re.sub(r"<.*>|'('+)|\\\\x..|\\u....", "", cleaned_line)
			cleaned_line = re.sub(r' +', ' ', cleaned_line)
			cleaned_line = cleaned_line[2:]

			if (detect(cleaned_line) == langArg and "://" not in cleaned_line):
				quoteList.append(cleaned_line)

		if line == '==Quotes==' or line == '== Quotes ==':
			write = True
		i += 1
	
	return quoteList
Example #8
    def testMath(self):
        markup1 = "the field {{math|'''R'''}} of real numbers"
        expect1 = "the field _inline_math_ of real numbers"
        self.assertEqual(unwiki.loads(markup1), expect1)
        markup2 = "the field {{  math |'''R'''}} of real numbers"
        expect2 = "the field _inline_math_ of real numbers"
        self.assertEqual(unwiki.loads(markup2), expect2)
        # Check the same for the mvar template
        markup1 = "the field {{mvar|'''R'''}} of real numbers"
        expect1 = "the field _inline_math_ of real numbers"
        self.assertEqual(unwiki.loads(markup1), expect1)
        markup2 = "the field {{  mvar |'''R'''}} of real numbers"
        expect2 = "the field _inline_math_ of real numbers"
        self.assertEqual(unwiki.loads(markup2), expect2)

        # math tags
        markup3 = "with a [[norm (mathematics)|norm]] <math>\|\cdot\|_X</math>"
        expect3 = "with a norm _inline_math_"
        self.assertEqual(unwiki.loads(markup3), expect3)
Example #9
    def testFreeform(self):

        infobox = '''{{Infobox settlement
        <!--See Template:Infobox settlement for additional fields that may be available-->
        <!--See the Table at Infobox settlement for all fields and descriptions of usage-->
        <!-- General information  --------------->
        |timezone               = [[Eastern Time Zone|Eastern Standard Time]]
        |utc_offset             = -5
        }}'''

        self.assertEqual(unwiki.loads(infobox), '')

        markup = """{{about|the borough in New York City}}\n'''Staten Island ''' {{IPAc-en|ˌ|s|t|æ|t|ən|_|ˈ|aɪ|l|ə|n|d}} is one of the five [[borough (New York City)|boroughs]] of [[New York City]], in the U.S. state of [[New York]]."""
        expect = "\nStaten Island   is one of the five boroughs of New York City, in the U.S. state of New York."

        self.assertEqual(unwiki.loads(markup), expect)

        markup = """In the southwest of the city, Staten Island is the southernmost part of both the city and state of New York, with [[Conference House Park]] at the southern tip of the island and the state.<ref>{{cite web|website=http://www.nycgovparks.org/parks/conferencehousepark|title=Conference House Park|publisher=New York City Parks|accessdate=June 21, 2014}}</ref>"""
        expect = """In the southwest of the city, Staten Island is the southernmost part of both the city and state of New York, with Conference House Park at the southern tip of the island and the state."""
        self.assertEqual(unwiki.loads(markup), expect)
Example #10
 def unWikifyString(self, s):
     """ Removes Wiki formatting from a string. """
     unWikifiedString = unwiki.loads(s)
     wordList = unWikifiedString.split()
     i = 0
     while i < len(wordList):
         # Remove words containing a pipe character
         if wordList[i].find('|') > -1:
             del wordList[i]
         else:
             i += 1
     return ' '.join(wordList)
Example #11
    def endElement(self, name):
        if name == 'page':
            for _ in ['text', 'title']:
                self.page[_] = convhans(unwiki.loads(self.page[_].strip()))

            if self.tester(self.page):
                print(self.page['title'], self.page['id'])
                self.z.writestr(
                    '{title}_{id}.txt'.format(**self.page),
                    '''{title}\n===========\n\n{text}\nhttps://zh.wikipedia.org/wiki/{title}\n'''
                    .format(**self.page))

        self.tags.pop()
Example #12
    def testLink(self):
        self.assertEqual(unwiki.loads('etc [[relative|link]] foo'), 'etc link foo')
        assert unwiki.loads('[[link]]') == 'link'
        self.assertEqual(unwiki.loads('[[relative link|link]]'), 'link')
        self.assertEqual(unwiki.loads('etc [[relative-link|link]] foo'), 'etc link foo')
        assert unwiki.loads('[[link (subject)|link]]') == 'link'

        assert unwiki.loads('[[Bar, Foo|Baz]], [[Foo]]') == 'Baz, Foo'
Example #13
def main():
    with open("snippet.txt") as f:
        content = f.read()
    print("Original:")
    print(content)
    print("=" * 100)
    print("WikiClean output:")
    print(wikiclean.clean(content))
    try:
        import unwiki
        print("=" * 100)
        print("UnWiki output:")
        print(unwiki.loads(content))
    except ImportError:
        pass
    try:
        import dewiki
        import dewiki.parser as parser
        print("=" * 100)
        print("DeWiki output:")
        print(parser.Parser().parse_string(content))
    except ImportError:
        pass
Example #14
 def testCompressSpaces(self):
     self.assertEqual(
         unwiki.loads('removing this {{thing}} leaves extra spaces', True),
         'removing this leaves extra spaces')
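The bare True passed as the second argument is unwiki.loads's compress_spaces flag, which also appears by keyword in the final example on this page. A small sketch of the keyword form, assuming unwiki is installed:

import unwiki

# Same call with the flag named explicitly
text = unwiki.loads('removing this {{thing}} leaves extra spaces', compress_spaces=True)
print(text)  # -> 'removing this leaves extra spaces'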
Example #15
if np.sum([use_stemmer, use_lemmatiser]) > 1:
    print('Choose only one option or none among: use_stemmer, use_lemmatiser')
    sys.exit()

if not os.path.exists(output_path):
    os.makedirs(output_path)

# If this is already created, skip it
if not os.path.exists(output_path + 'features/ocurrences_matrix_cookbook.npy'):
    # Get all the scraped Cookbook files
    wiki_files = sorted(
        [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))])
    # Clean the corpus
    corpus = []
    for wiki_file in wiki_files:
        text = unwiki.loads(' '.join(open(path + wiki_file)))

        text = clean_text(text, stemmer, lemmatiser)

        with open(output_path + wiki_file, 'w') as f:
            f.write(text)
        corpus.extend(text.split('. '))

    # Compute the occurrences matrix
    features_output_path = output_path + 'features/'
    if not os.path.exists(features_output_path):
        os.makedirs(features_output_path)

    if not os.path.exists(features_output_path + 'ocurrences_matrix_cookbook.npy'):
        print('Computing occurrences')
        cv = CountVectorizer(ngram_range=(ngram_size, ngram_size))
Example #16
    def testBracketFilenames(self):
        markup = """[[image:050712_perm_3.png|thumb|upright=1.7|Diagram of a cyclic permutation with two fixed points; a 6-cycle and two 1-cycles. |190x190px]]
A [[permutation]] is called"""
        expect = "\nA permutation is called"
        self.assertEqual(unwiki.loads(markup), expect)
Example #17
 def testBlockRemoval(self):
     markup1 = "this is a \n<blockquote>\n macizo\nhello\n</blockquote>"
     expect1 = "this is a \n\n macizo\nhello\n"
     self.assertEqual(unwiki.loads(markup1), expect1)
Example #18
 def testList(self):
     lis = '* foo\n * bar\n ** [[baz]]'
     self.assertEqual(unwiki.loads(lis), "* foo\n * bar\n ** baz")
Example #19
    def testNestedFileBracketRemoval(self):
        markup1 = """[[File:LA-Triceratops mount-2.jpg|thumb|250px|left|''[[Triceratops]]'' skeleton, [[Natural History Museum of Los Angeles County]]]]
Under [[phylogenetic nomenclature]], dinosaurs"""
        expect1 = """\nUnder phylogenetic nomenclature, dinosaurs""" 
        self.assertEqual(unwiki.loads(markup1), expect1)
Example #20
 def testCompressSpaces(self):
     self.assertEqual(unwiki.loads('removing this {{thing}} leaves extra spaces', True), 'removing this leaves extra spaces')
Example #21
 def testHeadline(self):
     self.assertEqual(unwiki.loads('=== Head ==='), ' Head ')
     self.assertEqual(unwiki.loads('=== Head ===\nText'), ' Head \nText')
Example #22
 def testComment(self):
     assert unwiki.loads('<!-- comment -->foo') == 'foo'
Example #23
def getLocationEmbeddinsFromWikipedia(processName, existingEmbeddinsIDs,
                                      startFromFileWithIndex, stopAtFile):

    print("Starting " + processName)

    n = 0
    createNewFile = True
    for line in smart_open(wikipediaDumpJSON):

        # Load into a dictionary
        article = json.loads(line.decode('utf-8'))

        if n < startFromFileWithIndex:
            n += 1
            continue

        if n > stopAtFile:
            locationEmbeddins.close()
            break

        if int(article['articleID']) in existingEmbeddinsIDs[int(
                article['articleID']) % hashSize]:
            #logger.info ("[{}] Embeddings for article with title {} is already created".format(processName, article['title']))
            continue

        if createNewFile:
            locationEmbeddins = open(
                './outputs/locationEmbeddins_{}_{}.txt'.format(
                    processName, str(n)), 'w')
            createNewFile = False

        logger.info("[{}]: Parsing article {}: {}".format(
            processName, str(n), article['title']))

        locationsInArticle = []
        for section_title, section_text in zip(article['section_titles'],
                                               article['section_texts']):

            # Remove wiki markups and HTML tags
            section_text = unwiki.loads(section_text, compress_spaces=True)
            section_text = re.sub(r'<.*?>', '', section_text)

            # Remove parentheses
            section_text = re.sub("[($@*&?].*[$)@*&?]", "", section_text)

            # Tokenize into sentences
            senteces_in_section = sent_tokenize(section_text)

            # Perform named entity recognition at the sentence level:
            for sentence in senteces_in_section:
                signal.signal(signal.SIGALRM, handler)
                signal.alarm(10)
                try:
                    listOfLocations = nerObj.getListOfLocationInSentece(
                        sentence)
                except Exception:
                    # Skip the sentence if NER fails or the alarm handler fires
                    continue
                signal.alarm(0)
                locationsInArticle.extend(listOfLocations)

        n += 1
        locationEmbeddins.write("{}\t{}\t{}\n".format(
            article['articleID'], article['title'],
            ";".join(locationsInArticle)))

        # Start a new output file every 1,000 articles (in case the script crashes partway through)
        if n % 1000 == 0:
            locationEmbeddins.close()
            createNewFile = True

    print("Exiting " + processName)