Python loads Examples, unwiki.loads Python Examples

Example #1

0

Show file

 def testMathRemoval(self):
     """Sub/superscript expressions are replaced by ``_inline_math_``."""
     cases = (
         ("If Z<sub>1</sub>, ..., ''Z''<sub>''k''</sub> are",
          "If _inline_math_, ..., _inline_math_ are"),
         (" (4 × 10<sup>12</sup> watts",
          " (4 × _inline_math_ watts"),
     )
     for markup, expected in cases:
         self.assertEqual(unwiki.loads(markup), expected)

Example #2

0

Show file

 def testRefRemoval(self):
     """``<ref>`` tags, self-closing or with a body, are dropped from the text."""
     self_closing = 'the best of a nation.<ref name="AdvisoryCommittee" />  In this way'
     self.assertEqual(unwiki.loads(self_closing),
                      "the best of a nation.  In this way")

     # A reference whose body itself contains quotes, a template and an entity.
     with_body = """[[Jacques Le Goff]]<ref name="Le Goff">Le Goff, Jacques. ''La civilisation de l'Occident médieval''. Paris. 1964; English translation (1988): ''Medieval Civilization'', {{ISBN|0-631-17566-0}} &ndash; "translatio imperii" is discussed in Part II, Chapter VI, section on "Time, eternity and history".</ref> describes"""
     self.assertEqual(unwiki.loads(with_body),
                      """Jacques Le Goff describes""")

Example #3

0

Show file

 def testREFTagIsConsumedCorrectly(self):
     """Mixed self-closing and paired ``<ref>`` tags only remove their own text."""
     cases = [
         ("hi <ref I should not see this/> And I should see this <ref> this not</ref>",
          "hi  And I should see this "),
         ("Now <ref>Remove This</ref> and forget <ref about this/>",
          "Now  and forget "),
     ]
     for markup, expected in cases:
         self.assertEqual(unwiki.loads(markup), expected)

Example #4

0

Show file

File: __init__.py Project: fitnr/unwiki

    def testInfobox(self):
        """``{{...}}`` templates are removed, including ones spanning lines."""
        # Same text as a triple-quoted literal broken after "Infobox":
        # a newline followed by twelve spaces of continuation indent.
        multiline_infobox = '{{Infobox\n            foo}} None'
        cases = [
            ('{{Infobox none}} None', ' None'),
            ('{{foo bar}}', ''),
            ('{{foo\nbar}}', ''),
            (multiline_infobox, ' None'),
        ]
        for markup, expected in cases:
            self.assertEqual(unwiki.loads(markup), expected)

Example #5

0

Show file

 def testHTMLspaces(self):
     """Check the text produced for markup with ``&nbsp;``, ``&lt;`` and ``&gt;``."""
     spaced_markup = "Let  &nbsp;''X''&nbsp;  be a non-negative integer and &nbsp;''n''&nbsp;"
     self.assertEqual(unwiki.loads(spaced_markup),
                      "Let   X   be a non-negative integer and  n ")

     angle_markup = "this should be a &lt;; and a &gt;"
     self.assertEqual(unwiki.loads(angle_markup),
                      "this should be a <; and a >")

Example #6

0

Show file

File: __init__.py Project: abeusher/unwiki

    def testInfobox(self):
        """Check that ``{{...}}`` templates are dropped and trailing text is kept."""
        load = unwiki.loads

        self.assertEqual(load('{{Infobox none}} None'), ' None')
        self.assertEqual(load('{{foo bar}}'), '')
        self.assertEqual(load('{{foo\nbar}}'), '')

        # Template broken across two lines, with a 12-space continuation indent.
        two_line_template = '{{Infobox\n' '            foo}} None'
        self.assertEqual(load(two_line_template), ' None')

Example #7

0

Show file

 def testNestedCurlyBracketRemoval(self):
     """Nested and repeated ``{{...}}`` templates are removed completely."""
     cases = (
         (''' Trying out {{the removal {{nested curly brackets}}}}''',
          ' Trying out '),
         (''' Trying out {{the removal {{nested curly brackets}} this is looking pretty good }}''',
          ' Trying out '),
         (''' Trying out If {{nowrap|log\u2009\'\'f\'\'(\'\'x\'\'; \'\'θ\'\')}} is {{nowrap| log θ the removal }}''',
          ' Trying out If  is '),
     )
     for markup, expected in cases:
         self.assertEqual(unwiki.loads(markup), expected)

Example #8

0

Show file

def writeQuotes(content):
	"""Collect cleaned quote lines from the "Quotes" section of a wiki page.

	Lines are scanned in order. Collection starts after a ``==Quotes==`` (or
	``== Quotes ==``) heading and stops at the next level-2 heading. While
	collecting, every ``* `` bullet shorter than ``cutoffArg + 3`` characters
	is stripped of wiki markup, normalised, and kept when ``detect`` reports
	the language ``langArg`` and the text contains no ``://``.

	Args:
		content: Sequence of page lines (strings).

	Returns:
		List of cleaned quote strings, in page order.
	"""
	quoteList = []
	write = False

	for line in content:
		# A level-2 heading ("==Title==" but not "===Title===") closes the
		# current section. startswith() avoids indexing past a bare "==" line.
		if line.startswith('==') and not line.startswith('==='):
			write = False
		if write and line.startswith('* ') and len(line) < (cutoffArg + 3):

			# would optimize, but since the program only needs to be run once, not really a priority
			cleaned_line = unwiki.loads(line) + '\n'
			cleaned_line = multireplace(cleaned_line, {"\\u2018": "'", "\\u2019": "'", "\\u2026": "...", "\\u2013": "-", "\\u2014": "-", "\\u201c": '"', "\\u201d": '"', "\\'": "'", "'''": "", "\n": ""})
			cleaned_line = re.sub(r"<.*>|'('+)|\\\\x..|\\u....", "", cleaned_line)
			cleaned_line = re.sub(r' +', ' ', cleaned_line)
			# Drop the leading "* " bullet marker.
			cleaned_line = cleaned_line[2:]

			if (detect(cleaned_line) == langArg and "://" not in cleaned_line):
				quoteList.append(cleaned_line)

		# Checked last so the heading line itself is never treated as a quote.
		if line in ('==Quotes==', '== Quotes =='):
			write = True

	return quoteList

Example #9

0

Show file

    def testMath(self):
        """``{{math}}``/``{{mvar}}`` templates and ``<math>`` tags become ``_inline_math_``."""
        expected = "the field _inline_math_ of real numbers"
        # Each template is checked with and without padding around its name;
        # the mvar template must behave the same as math.
        templates = (
            "{{math|'''R'''}}",
            "{{  math |'''R'''}}",
            "{{mvar|'''R'''}}",
            "{{  mvar |'''R'''}}",
        )
        for template in templates:
            markup = "the field " + template + " of real numbers"
            self.assertEqual(unwiki.loads(markup), expected)

        # math tags -- raw string so the LaTeX backslashes are not parsed as
        # (invalid) escape sequences; the resulting text is unchanged.
        markup3 = r"with a [[norm (mathematics)|norm]] <math>\|\cdot\|_X</math>"
        expect3 = "with a norm _inline_math_"
        self.assertEqual(unwiki.loads(markup3), expect3)

Example #10

0

Show file

File: __init__.py Project: abeusher/unwiki

    def testFreeform(self):
        """Run ``unwiki.loads`` over longer, article-like fragments."""
        # A multi-line infobox with embedded comments and a link disappears.
        settlement_infobox = '''{{Infobox settlement
        <!--See Template:Infobox settlement for additional fields that may be available-->
        <!--See the Table at Infobox settlement for all fields and descriptions of usage-->
        <!-- General information  --------------->
        |timezone               = [[Eastern Time Zone|Eastern Standard Time]]
        |utc_offset             = -5
        }}'''
        self.assertEqual(unwiki.loads(settlement_infobox), '')

        # Lead sentence: templates removed, bold stripped, links unwrapped.
        lead_markup = """{{about|the borough in New York City}}\n'''Staten Island ''' {{IPAc-en|ˌ|s|t|æ|t|ən|_|ˈ|aɪ|l|ə|n|d}} is one of the five [[borough (New York City)|boroughs]] of [[New York City]], in the U.S. state of [[New York]]."""
        lead_text = "\nStaten Island   is one of the five boroughs of New York City, in the U.S. state of New York."
        self.assertEqual(unwiki.loads(lead_markup), lead_text)

        # Trailing <ref> holding a cite template is dropped entirely.
        cited_markup = """In the southwest of the city, Staten Island is the southernmost part of both the city and state of New York, with [[Conference House Park]] at the southern tip of the island and the state.<ref>{{cite web|website=http://www.nycgovparks.org/parks/conferencehousepark|title=Conference House Park|publisher=New York City Parks|accessdate=June 21, 2014}}</ref>"""
        cited_text = """In the southwest of the city, Staten Island is the southernmost part of both the city and state of New York, with Conference House Park at the southern tip of the island and the state."""
        self.assertEqual(unwiki.loads(cited_markup), cited_text)

Example #11

0

Show file

File: __init__.py Project: fitnr/unwiki

    def testFreeform(self):
        """Exercise ``unwiki.loads`` on an infobox and two article sentences."""
        infobox = '''{{Infobox settlement
        <!--See Template:Infobox settlement for additional fields that may be available-->
        <!--See the Table at Infobox settlement for all fields and descriptions of usage-->
        <!-- General information  --------------->
        |timezone               = [[Eastern Time Zone|Eastern Standard Time]]
        |utc_offset             = -5
        }}'''

        cases = [
            # Whole infobox is removed.
            (infobox, ''),
            # Templates removed, bold markers stripped, links unwrapped.
            ("""{{about|the borough in New York City}}\n'''Staten Island ''' {{IPAc-en|ˌ|s|t|æ|t|ən|_|ˈ|aɪ|l|ə|n|d}} is one of the five [[borough (New York City)|boroughs]] of [[New York City]], in the U.S. state of [[New York]].""",
             "\nStaten Island   is one of the five boroughs of New York City, in the U.S. state of New York."),
            # Trailing reference with a cite template is removed.
            ("""In the southwest of the city, Staten Island is the southernmost part of both the city and state of New York, with [[Conference House Park]] at the southern tip of the island and the state.<ref>{{cite web|website=http://www.nycgovparks.org/parks/conferencehousepark|title=Conference House Park|publisher=New York City Parks|accessdate=June 21, 2014}}</ref>""",
             """In the southwest of the city, Staten Island is the southernmost part of both the city and state of New York, with Conference House Park at the southern tip of the island and the state."""),
        ]
        for markup, expect in cases:
            self.assertEqual(unwiki.loads(markup), expect)

Example #12

0

Show file

File: wiki.py Project: mcmont/mldemos

 def unWikifyString(self, s):
     """Remove wiki formatting from ``s`` and drop words that contain a pipe.

     The text is passed through ``unwiki.loads`` and split on whitespace; the
     remaining words are returned joined by single spaces.
     """
     words = unwiki.loads(s).split()
     # Filter in a single pass: deleting from the list inside an index loop
     # shifts the tail on every removal and is quadratic in the worst case.
     return ' '.join(word for word in words if '|' not in word)

Example #13

0

Show file

File: extract.py Project: zhuth/wiki_extractor

    def endElement(self, name):
        """Handle a closing tag; on ``page`` clean the fields and store the article.

        For a ``page`` element the ``text`` and ``title`` entries of
        ``self.page`` are stripped, un-wikified and passed through
        ``convhans``; if ``self.tester`` accepts the page it is printed and
        written into ``self.z``. The tag stack is popped for every element.
        """
        if name == 'page':
            for field in ('text', 'title'):
                stripped = self.page[field].strip()
                self.page[field] = convhans(unwiki.loads(stripped))

            if self.tester(self.page):
                print(self.page['title'], self.page['id'])
                entry_name = '{title}_{id}.txt'.format(**self.page)
                entry_body = (
                    '''{title}\n===========\n\n{text}\nhttps://zh.wikipedia.org/wiki/{title}\n'''
                    .format(**self.page))
                self.z.writestr(entry_name, entry_body)

        self.tags.pop()

Example #14

0

Show file

    def testLink(self):
        """Wiki links reduce to their label, or to the target when there is no pipe."""
        cases = [
            ('etc [[relative|link]] foo', 'etc link foo'),
            ('[[link]]', 'link'),
            ('[[relative link|link]]', 'link'),
            ('etc [[relative-link|link]] foo', 'etc link foo'),
            ('[[link (subject)|link]]', 'link'),
            ('[[Bar, Foo|Baz]], [[Foo]]', 'Baz, Foo'),
        ]
        for markup, expected in cases:
            # assertEqual rather than a bare assert: it is not stripped under
            # ``python -O`` and reports both values when the check fails.
            self.assertEqual(unwiki.loads(markup), expected)

Example #15

0

Show file

File: __init__.py Project: fitnr/unwiki

    def testLink(self):
        """Check that ``[[target|label]]`` and ``[[target]]`` links are unwrapped."""
        # Every check uses assertEqual (the original mixed in bare asserts,
        # which are stripped under ``python -O`` and hide the compared values).
        self.assertEqual(unwiki.loads('etc [[relative|link]] foo'), 'etc link foo')
        self.assertEqual(unwiki.loads('[[link]]'), 'link')
        self.assertEqual(unwiki.loads('[[relative link|link]]'), 'link')
        self.assertEqual(unwiki.loads('etc [[relative-link|link]] foo'), 'etc link foo')
        self.assertEqual(unwiki.loads('[[link (subject)|link]]'), 'link')

        # Two links in one string, the first with a comma in its target.
        self.assertEqual(unwiki.loads('[[Bar, Foo|Baz]], [[Foo]]'), 'Baz, Foo')

Example #16

0

Show file

File: clean_snippet.py Project: daemon/pywikiclean

def main():
    """Print ``snippet.txt`` and the result of each available wiki cleaner."""
    separator = "=" * 100

    with open("snippet.txt") as snippet_file:
        content = snippet_file.read()

    print("Original:")
    print(content)
    print(separator)
    print("WikiClean output:")
    print(wikiclean.clean(content))

    # unwiki and dewiki are optional comparisons: when the import fails the
    # corresponding section is skipped without any output.
    try:
        import unwiki
        print(separator)
        print("UnWiki output:")
        print(unwiki.loads(content))
    except ImportError:
        pass

    try:
        import dewiki
        import dewiki.parser as parser
        print(separator)
        print("DeWiki output:")
        dewiki_parser = parser.Parser()
        print(dewiki_parser.parse_string(content))
    except ImportError:
        pass

Example #17

0

Show file

File: __init__.py Project: abeusher/unwiki

 def testCompressSpaces(self):
     """With ``True`` as second argument no double space remains after a removal."""
     compressed = unwiki.loads('removing this {{thing}} leaves extra spaces', True)
     self.assertEqual(compressed, 'removing this leaves extra spaces')

Example #18

0

Show file

# The stemmer and the lemmatiser are mutually exclusive options.
if np.sum([use_stemmer, use_lemmatiser]) > 1:
    print('Choose only one option or none among: use_stemmer, use_lemmatiser')
    sys.exit()

if not os.path.exists(output_path):
    os.makedirs(output_path)

# If this is already created, skip it
if not os.path.exists(output_path + 'features/ocurrences_matrix_cookbook.npy'):
    # Get all the scraped Cookbook files
    wiki_files = sorted(
        [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))])
    # Clean the corpus
    corpus = []
    for wiki_file in wiki_files:
        # Read inside a context manager so the source file is closed
        # immediately instead of leaking one open handle per wiki file.
        with open(path + wiki_file) as wiki_source:
            text = unwiki.loads(' '.join(wiki_source))

        text = clean_text(text, stemmer, lemmatiser)

        # Store the cleaned text under the same file name in output_path.
        with open(output_path + wiki_file, 'w') as f:
            f.write(text)
        # Sentences (split on '. ') are the units fed to the vectoriser.
        corpus.extend(text.split('. '))

    # Compute the occurrences matrix
    features_output_path = output_path + 'features/'
    if not os.path.exists(features_output_path):
        os.makedirs(features_output_path)

    # NOTE(review): new_output_path is not defined in the visible code; this
    # may have been meant to be features_output_path -- confirm.
    if not os.path.exists(new_output_path + 'ocurrences_matrix_cookbook.npy'):
        print('Computing occurrences')
        cv = CountVectorizer(ngram_range=(ngram_size, ngram_size))

Example #19

0

Show file

    def testBracketFilenames(self):
        """An ``[[image:...]]`` embed is removed; the following line is kept."""
        image_embed = "[[image:050712_perm_3.png|thumb|upright=1.7|Diagram of a cyclic permutation with two fixed points; a 6-cycle and two 1-cycles. |190x190px]]"
        following_line = "A [[permutation]] is called"
        markup = image_embed + "\n" + following_line
        self.assertEqual(unwiki.loads(markup), "\nA permutation is called")

Example #20

0

Show file

 def testBlockRemoval(self):
     """``<blockquote>`` tags are removed while the enclosed lines stay."""
     quoted = "this is a \n<blockquote>\n macizo\nhello\n</blockquote>"
     self.assertEqual(unwiki.loads(quoted),
                      "this is a \n\n macizo\nhello\n")

Example #21

0

Show file

File: __init__.py Project: fitnr/unwiki

 def testList(self):
     """List bullets are preserved; only the link inside the list is unwrapped."""
     bullets = '* foo\n * bar\n ** [[baz]]'
     plain = "* foo\n * bar\n ** baz"
     self.assertEqual(unwiki.loads(bullets), plain)

Example #22

0

Show file

    def testNestedFileBracketRemoval(self):
        """A ``[[File:...]]`` embed with nested links in its caption is removed."""
        file_embed = "[[File:LA-Triceratops mount-2.jpg|thumb|250px|left|''[[Triceratops]]'' skeleton, [[Natural History Museum of Los Angeles County]]]]"
        markup = file_embed + "\nUnder [[phylogenetic nomenclature]], dinosaurs"
        self.assertEqual(unwiki.loads(markup),
                         "\nUnder phylogenetic nomenclature, dinosaurs")

Example #23

0

Show file

File: __init__.py Project: fitnr/unwiki

 def testCompressSpaces(self):
     """Removing a template with the second argument ``True`` leaves single spaces."""
     markup = 'removing this {{thing}} leaves extra spaces'
     expected = 'removing this leaves extra spaces'
     self.assertEqual(unwiki.loads(markup, True), expected)

Example #24

0

Show file

File: __init__.py Project: fitnr/unwiki

 def testHeadline(self):
     """``===`` heading markers are stripped; the padded title text remains."""
     for markup, expected in (('=== Head ===', ' Head '),
                              ('=== Head ===\nText', ' Head \nText')):
         self.assertEqual(unwiki.loads(markup), expected)

Example #25

0

Show file

File: __init__.py Project: fitnr/unwiki

 def testComment(self):
     """An HTML comment is stripped, leaving the text that follows it."""
     # assertEqual instead of a bare assert: the check still runs under
     # ``python -O`` and a failure shows both compared values.
     self.assertEqual(unwiki.loads('<!-- comment -->foo'), 'foo')

Example #26

0

Show file

File: __init__.py Project: abeusher/unwiki

 def testList(self):
     """Bullet markers survive unchanged while ``[[baz]]`` becomes ``baz``."""
     self.assertEqual(
         unwiki.loads('* foo\n * bar\n ** [[baz]]'),
         "* foo\n * bar\n ** baz",
     )

Example #27

0

Show file

File: __init__.py Project: abeusher/unwiki

 def testComment(self):
     """Check that ``<!-- ... -->`` is removed from the output."""
     stripped = unwiki.loads('<!-- comment -->foo')
     # Use the unittest assertion so the check is not dropped under
     # ``python -O`` and failures report the actual value.
     self.assertEqual(stripped, 'foo')

Example #28

0

Show file

def getLocationEmbeddinsFromWikipedia(processName, existingEmbeddinsIDs,
                                      startFromFileWithIndex, stopAtFile):
    """Extract location mentions per Wikipedia article into tab-separated files.

    Reads ``wikipediaDumpJSON`` one JSON article per line. The first
    ``startFromFileWithIndex`` articles are skipped, as is any article whose
    ID is already present in ``existingEmbeddinsIDs``. For each remaining
    article every section is un-wikified, split into sentences and passed to
    ``nerObj``; one line ``articleID<TAB>title<TAB>loc1;loc2;...`` is written
    to ``./outputs/locationEmbeddins_<processName>_<n>.txt``. A new output
    file is started after every 1000 counted articles.

    Args:
        processName: Label used in log messages and output file names.
        existingEmbeddinsIDs: Containers of already-processed article IDs,
            indexed by ``articleID % hashSize``.
        startFromFileWithIndex: Number of leading articles to skip.
        stopAtFile: Processing stops once the article counter exceeds this.
    """
    print("Starting " + processName)

    n = 0
    createNewFile = True
    # Bound up front so cleanup never touches an unassigned name, e.g. when
    # the stop condition is reached before any output file was opened.
    locationEmbeddins = None
    try:
        for line in smart_open(wikipediaDumpJSON):

            # Load into a dictionary
            article = json.loads(line.decode('utf-8'))

            if n < startFromFileWithIndex:
                n += 1
                continue

            if n > stopAtFile:
                break

            # Already-processed articles are skipped without advancing n.
            articleID = int(article['articleID'])
            if articleID in existingEmbeddinsIDs[articleID % hashSize]:
                continue

            if createNewFile:
                locationEmbeddins = open(
                    './outputs/locationEmbeddins_{}_{}.txt'.format(
                        processName, str(n)), 'w')
                createNewFile = False

            logger.info("[{}]: Parsing article {}: {}".format(
                processName, str(n), article['title']))

            locationsInArticle = []
            # zip() pairs titles with texts and stops at the shorter list;
            # the title itself is not used.
            for section_title, section_text in zip(article['section_titles'],
                                                   article['section_texts']):

                # Remove wiki markups and HTML tags
                section_text = unwiki.loads(section_text, compress_spaces=True)
                section_text = re.sub(r'<.*?>', '', section_text)

                # Remove parentheses (and other delimited spans)
                section_text = re.sub("[($@*&?].*[$)@*&?]", "", section_text)

                # Tokenize into sentences
                senteces_in_section = sent_tokenize(section_text)

                # Perform named entity recognition at a sentence level,
                # giving each sentence at most 10 seconds.
                for sentence in senteces_in_section:
                    signal.signal(signal.SIGALRM, handler)
                    signal.alarm(10)
                    try:
                        listOfLocations = nerObj.getListOfLocationInSentece(
                            sentence)
                    except Exception:
                        # Best effort: skip sentences that fail or time out.
                        # NOTE(review): assumes `handler` raises an Exception
                        # subclass on timeout -- confirm.
                        continue
                    finally:
                        # Always disarm the alarm, even on failure, so it
                        # cannot fire later outside the NER call.
                        signal.alarm(0)
                    locationsInArticle.extend(listOfLocations)

            n += 1
            locationEmbeddins.write("{}\t{}\t{}\n".format(
                article['articleID'], article['title'],
                ";".join(locationsInArticle)))

            # Create new file every 1000 iterations (just in case the script
            # crashes in the middle)
            if n % 1000 == 0:
                locationEmbeddins.close()
                createNewFile = True
    finally:
        # Close the current output file on every exit path; closing an
        # already-closed file is a no-op.
        if locationEmbeddins is not None:
            locationEmbeddins.close()

    print("Exiting " + processName)

Example #29

0

Show file

File: __init__.py Project: abeusher/unwiki

 def testHeadline(self):
     """Heading markup loses its ``===`` markers, alone or followed by text."""
     bare_heading = unwiki.loads('=== Head ===')
     self.assertEqual(bare_heading, ' Head ')

     heading_with_text = unwiki.loads('=== Head ===\nText')
     self.assertEqual(heading_with_text, ' Head \nText')