Code example #1
 def testMathRemoval(self):
     markup1 = "If Z<sub>1</sub>, ..., ''Z''<sub>''k''</sub> are"
     expect1 = "If _inline_math_, ..., _inline_math_ are"
     markup2 = " (4 × 10<sup>12</sup> watts"
     expect2 = " (4 × _inline_math_ watts"
     self.assertEqual(unwiki.loads(markup1), expect1)
     self.assertEqual(unwiki.loads(markup2), expect2)
Code example #2
 def testRefRemoval(self):
     markup1 = 'the best of a nation.<ref name="AdvisoryCommittee" />  In this way'
     expect1 = "the best of a nation.  In this way"
     self.assertEqual(unwiki.loads(markup1), expect1)
     markup2 = """[[Jacques Le Goff]]<ref name="Le Goff">Le Goff, Jacques. ''La civilisation de l'Occident médieval''. Paris. 1964; English translation (1988): ''Medieval Civilization'', {{ISBN|0-631-17566-0}} &ndash; "translatio imperii" is discussed in Part II, Chapter VI, section on "Time, eternity and history".</ref> describes"""
     expect2 = """Jacques Le Goff describes"""
     self.assertEqual(unwiki.loads(markup2), expect2)
Code example #3
 def testREFTagIsConsumedCorrectly(self):
     markup1 = "hi <ref I should not see this/> And I should see this <ref> this not</ref>"
     expect1 = "hi  And I should see this "
     markup2 = "Now <ref>Remove This</ref> and forget <ref about this/>"
     expect2 = "Now  and forget "
     self.assertEqual(unwiki.loads(markup1), expect1)
     self.assertEqual(unwiki.loads(markup2), expect2)
Code example #4
File: __init__.py Project: fitnr/unwiki
    def testInfobox(self):
        self.assertEqual(unwiki.loads('{{Infobox none}} None'), ' None')
        self.assertEqual(unwiki.loads('{{foo bar}}'), '')
        self.assertEqual(unwiki.loads("""{{foo\nbar}}"""), '')

        self.assertEqual(unwiki.loads("""{{Infobox
            foo}} None"""), ' None')
Code example #5
 def testHTMLspaces(self):
     markup1 = "Let  &nbsp;''X''&nbsp;  be a non-negative integer and &nbsp;''n''&nbsp;"
     expect1 = "Let   X   be a non-negative integer and  n "
     self.assertEqual(unwiki.loads(markup1), expect1)
     markup2 = "this should be a &lt;; and a &gt;"
     expect2 = "this should be a <; and a >"
     self.assertEqual(unwiki.loads(markup2), expect2)
Code example #6
 def testNestedCurlyBracketRemoval(self):
     markup1 = ''' Trying out {{the removal {{nested curly brackets}}}}'''
     expect1 = ' Trying out '
     markup2 = ''' Trying out {{the removal {{nested curly brackets}} this is looking pretty good }}'''
     expect2 = ' Trying out '
     markup3 = ''' Trying out If {{nowrap|log\u2009\'\'f\'\'(\'\'x\'\'; \'\'θ\'\')}} is {{nowrap| log θ the removal }}'''
     expect3 = ' Trying out If  is '
     self.assertEqual(unwiki.loads(markup1), expect1)
     self.assertEqual(unwiki.loads(markup2), expect2)
     self.assertEqual(unwiki.loads(markup3), expect3)
Code example #7
def writeQuotes(content):
	global langArg
	global cutoffArg

	quoteList = []
	write = False
	i = 0

	while i < len(content):
		line = content[i]

		if line.startswith('==') and not line.startswith('==='):  # a new level-2 '==' section heading ends the quote block
			write = False
		if write and line.startswith('* ') and len(line) < (cutoffArg + 3):

			# Could be optimized, but since the program only needs to run once, it is not a priority
			cleaned_line = unwiki.loads(line) + '\n'
			cleaned_line = multireplace(cleaned_line, {"\\u2018": "'", "\\u2019": "'", "\\u2026": "...", "\\u2013": "-", "\\u2014": "-", "\\u201c": '"', "\\u201d": '"', "\\'": "'", "'''": "", "\n": ""})
			cleaned_line = re.sub(r"<.*>|'('+)|\\\\x..|\\u....", "", cleaned_line)
			cleaned_line = re.sub(r' +', ' ', cleaned_line)
			cleaned_line = cleaned_line[2:]

			if (detect(cleaned_line) == langArg and "://" not in cleaned_line):
				quoteList.append(cleaned_line)

		if line == '==Quotes==' or line == '== Quotes ==':
			write = True
		i += 1
	
	return quoteList
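
The writeQuotes function above relies on a multireplace helper that is not shown in this snippet. Below is a minimal sketch of such a helper, assuming it simply applies each replacement pair in turn; the project's real implementation may differ (for example, by compiling a single regular expression).

def multireplace(text, replacements):
    # Hypothetical stand-in for the multireplace helper used above:
    # apply each old -> new substitution from the mapping to the text.
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text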
Code example #8
    def testMath(self):
        markup1 = "the field {{math|'''R'''}} of real numbers"
        expect1 = "the field _inline_math_ of real numbers"
        self.assertEqual(unwiki.loads(markup1), expect1)
        markup2 = "the field {{  math |'''R'''}} of real numbers"
        expect2 = "the field _inline_math_ of real numbers"
        self.assertEqual(unwiki.loads(markup2), expect2)
        # Check the same for the mvar template
        markup1 = "the field {{mvar|'''R'''}} of real numbers"
        expect1 = "the field _inline_math_ of real numbers"
        self.assertEqual(unwiki.loads(markup1), expect1)
        markup2 = "the field {{  mvar |'''R'''}} of real numbers"
        expect2 = "the field _inline_math_ of real numbers"
        self.assertEqual(unwiki.loads(markup2), expect2)

        # math tags
        markup3 = "with a [[norm (mathematics)|norm]] <math>\|\cdot\|_X</math>"
        expect3 = "with a norm _inline_math_"
        self.assertEqual(unwiki.loads(markup3), expect3)
Code example #9
File: __init__.py Project: abeusher/unwiki
    def testFreeform(self):

        infobox = '''{{Infobox settlement
        <!--See Template:Infobox settlement for additional fields that may be available-->
        <!--See the Table at Infobox settlement for all fields and descriptions of usage-->
        <!-- General information  --------------->
        |timezone               = [[Eastern Time Zone|Eastern Standard Time]]
        |utc_offset             = -5
        }}'''

        self.assertEqual(unwiki.loads(infobox), '')

        markup = """{{about|the borough in New York City}}\n'''Staten Island ''' {{IPAc-en|ˌ|s|t|æ|t|ən|_|ˈ|aɪ|l|ə|n|d}} is one of the five [[borough (New York City)|boroughs]] of [[New York City]], in the U.S. state of [[New York]]."""
        expect = "\nStaten Island   is one of the five boroughs of New York City, in the U.S. state of New York."

        self.assertEqual(unwiki.loads(markup), expect)

        markup = """In the southwest of the city, Staten Island is the southernmost part of both the city and state of New York, with [[Conference House Park]] at the southern tip of the island and the state.<ref>{{cite web|website=http://www.nycgovparks.org/parks/conferencehousepark|title=Conference House Park|publisher=New York City Parks|accessdate=June 21, 2014}}</ref>"""
        expect = """In the southwest of the city, Staten Island is the southernmost part of both the city and state of New York, with Conference House Park at the southern tip of the island and the state."""
        self.assertEqual(unwiki.loads(markup), expect)
Code example #10
File: wiki.py Project: mcmont/mldemos
 def unWikifyString(self, s):
     """ Removes Wiki formatting from a string. """
     unWikifiedString = unwiki.loads(s)
     wordList = unWikifiedString.split()
     i = 0
     while i < len(wordList):
         # Remove words containing a pipe character
         if wordList[i].find('|') > -1:
             del wordList[i]
         else:
             i += 1
     return ' '.join(wordList)
Code example #11
File: extract.py Project: zhuth/wiki_extractor
    def endElement(self, name):
        if name == 'page':
            for _ in ['text', 'title']:
                self.page[_] = convhans(unwiki.loads(self.page[_].strip()))

            if self.tester(self.page):
                print(self.page['title'], self.page['id'])
                self.z.writestr(
                    '{title}_{id}.txt'.format(**self.page),
                    '''{title}\n===========\n\n{text}\nhttps://zh.wikipedia.org/wiki/{title}\n'''
                    .format(**self.page))

        self.tags.pop()
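
The convhans helper called above is not part of unwiki and is not shown here; given the Chinese Wikipedia URLs in this extractor, it presumably normalizes the page text to a single Chinese script. A minimal sketch under that assumption, using OpenCC as a hypothetical stand-in for the project's actual implementation:

from opencc import OpenCC

_cc = OpenCC('t2s')  # Traditional-to-Simplified conversion profile

def convhans(text):
    # Assumed behaviour: convert Traditional Chinese text to Simplified Chinese.
    return _cc.convert(text)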
Code example #12
File: __init__.py Project: fitnr/unwiki
    def testLink(self):
        self.assertEqual(unwiki.loads('etc [[relative|link]] foo'), 'etc link foo')
        assert unwiki.loads('[[link]]') == 'link'
        self.assertEqual(unwiki.loads('[[relative link|link]]'), 'link')
        self.assertEqual(unwiki.loads('etc [[relative-link|link]] foo'), 'etc link foo')
        assert unwiki.loads('[[link (subject)|link]]') == 'link'

        assert unwiki.loads('[[Bar, Foo|Baz]], [[Foo]]') == 'Baz, Foo'
Code example #13
File: clean_snippet.py Project: daemon/pywikiclean
def main():
    with open("snippet.txt") as f:
        content = f.read()
    print("Original:")
    print(content)
    print("=" * 100)
    print("WikiClean output:")
    print(wikiclean.clean(content))
    try:
        import unwiki
        print("=" * 100)
        print("UnWiki output:")
        print(unwiki.loads(content))
    except ImportError:
        pass
    try:
        import dewiki
        import dewiki.parser as parser
        print("=" * 100)
        print("DeWiki output:")
        print(parser.Parser().parse_string(content))
    except ImportError:
        pass
Code example #14
File: __init__.py Project: abeusher/unwiki
 def testCompressSpaces(self):
     self.assertEqual(
         unwiki.loads('removing this {{thing}} leaves extra spaces', True),
         'removing this leaves extra spaces')
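
The positional True in the call above appears to correspond to the compress_spaces flag, which is spelled out as a keyword in the getLocationEmbeddinsFromWikipedia example later in this listing. Assuming that mapping is correct, the same call can be written more readably as:

import unwiki

text = unwiki.loads('removing this {{thing}} leaves extra spaces',
                    compress_spaces=True)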
Code example #15
if np.sum([use_stemmer, use_lemmatiser]) > 1:
    print('Choose only one option or none among: use_stemmer, use_lemmatiser')
    sys.exit()

if not os.path.exists(output_path):
    os.makedirs(output_path)

# If this is already created, skip it
if not os.path.exists(output_path + 'features/ocurrences_matrix_cookbook.npy'):
    # Get all the scraped Cookbook files
    wiki_files = sorted(
        [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))])
    # Clean the corpus
    corpus = []
    for wiki_file in wiki_files:
        text = unwiki.loads(' '.join(open(path + wiki_file)))

        text = clean_text(text, stemmer, lemmatiser)

        with open(output_path + wiki_file, 'w') as f:
            f.write(text)
        corpus.extend(text.split('. '))

    # Compute the occurrences matrix
    features_output_path = output_path + 'features/'
    if not os.path.exists(features_output_path):
        os.makedirs(features_output_path)

    if not os.path.exists(features_output_path + 'ocurrences_matrix_cookbook.npy'):
        print('Computing occurrences')
        cv = CountVectorizer(ngram_range=(ngram_size, ngram_size))
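
The clean_text helper called above is not shown in this snippet. Below is a minimal sketch of what it might do, assuming it lowercases the text, strips everything except letters, periods and whitespace (periods are kept because the corpus is later split on '. '), and optionally applies the stemmer or lemmatiser that was passed in; this is an assumption, not the project's actual implementation.

import re

def clean_text(text, stemmer=None, lemmatiser=None):
    # Hypothetical normalization: lowercase, keep only letters, periods and
    # whitespace, then optionally stem or lemmatize each token.
    text = text.lower()
    text = re.sub(r'[^a-z.\s]', ' ', text)
    tokens = text.split()
    if stemmer is not None:
        tokens = [stemmer.stem(t) for t in tokens]
    elif lemmatiser is not None:
        tokens = [lemmatiser.lemmatize(t) for t in tokens]
    return ' '.join(tokens)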
Code example #16
    def testBracketFilenames(self):
        markup = """[[image:050712_perm_3.png|thumb|upright=1.7|Diagram of a cyclic permutation with two fixed points; a 6-cycle and two 1-cycles. |190x190px]]
A [[permutation]] is called"""
        expect = "\nA permutation is called"
        self.assertEqual(unwiki.loads(markup), expect)
Code example #17
 def testBlockRemoval(self):
     markup1 = "this is a \n<blockquote>\n macizo\nhello\n</blockquote>"
     expect1 = "this is a \n\n macizo\nhello\n"
     self.assertEqual(unwiki.loads(markup1), expect1)
Code example #18
File: __init__.py Project: fitnr/unwiki
 def testList(self):
     lis = '* foo\n * bar\n ** [[baz]]'
     self.assertEqual(unwiki.loads(lis), "* foo\n * bar\n ** baz")
Code example #19
    def testNestedFileBracketRemoval(self):
        markup1 = """[[File:LA-Triceratops mount-2.jpg|thumb|250px|left|''[[Triceratops]]'' skeleton, [[Natural History Museum of Los Angeles County]]]]
Under [[phylogenetic nomenclature]], dinosaurs"""
        expect1 = """\nUnder phylogenetic nomenclature, dinosaurs""" 
        self.assertEqual(unwiki.loads(markup1), expect1)
Code example #20
File: __init__.py Project: fitnr/unwiki
 def testHeadline(self):
     self.assertEqual(unwiki.loads('=== Head ==='), ' Head ')
     self.assertEqual(unwiki.loads('=== Head ===\nText'), ' Head \nText')
Code example #21
File: __init__.py Project: fitnr/unwiki
 def testComment(self):
     assert unwiki.loads('<!-- comment -->foo') == 'foo'
Code example #22
def getLocationEmbeddinsFromWikipedia(processName, existingEmbeddinsIDs,
                                      startFromFileWithIndex, stopAtFile):

    print("Starting " + processName)

    n = 0
    createNewFile = True
    for line in smart_open(wikipediaDumpJSON):

        # Load into a dictionary
        article = json.loads(line.decode('utf-8'))

        if n < startFromFileWithIndex:
            n += 1
            continue

        if n > stopAtFile:
            locationEmbeddins.close()
            break

        if int(article['articleID']) in existingEmbeddinsIDs[int(
                article['articleID']) % hashSize]:
            #logger.info ("[{}] Embeddings for article with title {} is already created".format(processName, article['title']))
            continue

        if createNewFile:
            locationEmbeddins = open(
                './outputs/locationEmbeddins_{}_{}.txt'.format(
                    processName, str(n)), 'w')
            createNewFile = False

        logger.info("[{}]: Parsing article {}: {}".format(
            processName, str(n), article['title']))

        locationsInArticle = []
        for section_title, section_text in zip(article['section_titles'],
                                               article['section_texts']):

            # Remove wiki markup and HTML tags
            section_text = unwiki.loads(section_text, compress_spaces=True)
            section_text = re.sub(r'<.*?>', '', section_text)

            # Remove parenthesized text
            section_text = re.sub("[($@*&?].*[$)@*&?]", "", section_text)

            # Tokenize into sentences
            senteces_in_section = sent_tokenize(section_text)

            # Perform named entity recognition at the sentence level:
            for sentence in senteces_in_section:
                signal.signal(signal.SIGALRM, handler)
                signal.alarm(10)
                try:
                    listOfLocations = nerObj.getListOfLocationInSentece(
                        sentence)
                except:
                    continue
                signal.alarm(0)
                locationsInArticle.extend(listOfLocations)

        n += 1
        locationEmbeddins.write("{}\t{}\t{}\n".format(
            article['articleID'], article['title'],
            ";".join(locationsInArticle)))

        # Start a new output file every 1000 iterations (in case the script crashes partway through)
        if n % 1000 == 0:
            locationEmbeddins.close()
            createNewFile = True

    print("Exiting " + processName)